Instructions to use Gege24/envgnr-Qwen3b-hyperGG-commit-1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Gege24/envgnr-Qwen3b-hyperGG-commit-1 with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("/cache/models/Qwen--Qwen2-7B-Instruct")
model = PeftModel.from_pretrained(base_model, "Gege24/envgnr-Qwen3b-hyperGG-commit-1")
- Transformers
How to use Gege24/envgnr-Qwen3b-hyperGG-commit-1 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Gege24/envgnr-Qwen3b-hyperGG-commit-1")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Gege24/envgnr-Qwen3b-hyperGG-commit-1", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Gege24/envgnr-Qwen3b-hyperGG-commit-1 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Gege24/envgnr-Qwen3b-hyperGG-commit-1"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/envgnr-Qwen3b-hyperGG-commit-1",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/Gege24/envgnr-Qwen3b-hyperGG-commit-1
- SGLang
How to use Gege24/envgnr-Qwen3b-hyperGG-commit-1 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Gege24/envgnr-Qwen3b-hyperGG-commit-1" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/envgnr-Qwen3b-hyperGG-commit-1",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Gege24/envgnr-Qwen3b-hyperGG-commit-1" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/envgnr-Qwen3b-hyperGG-commit-1",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use Gege24/envgnr-Qwen3b-hyperGG-commit-1 with Docker Model Runner:
docker model run hf.co/Gege24/envgnr-Qwen3b-hyperGG-commit-1
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.0093, | |
| "eval_steps": 500, | |
| "global_step": 465, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 822.0, | |
| "completions/max_terminated_length": 822.0, | |
| "completions/mean_length": 741.390625, | |
| "completions/mean_terminated_length": 741.390625, | |
| "completions/min_length": 296.0, | |
| "completions/min_terminated_length": 296.0, | |
| "entropy": 0.19560225727036595, | |
| "epoch": 2e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6278125643730164, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0026, | |
| "num_tokens": 102275.0, | |
| "reward": -0.2660611569881439, | |
| "reward_std": 9.006877899169922, | |
| "rewards/rollout_reward_func/mean": -0.26606130599975586, | |
| "rewards/rollout_reward_func/std": 10.133543014526367, | |
| "sampling/importance_sampling_ratio/max": 1.4521965980529785, | |
| "sampling/importance_sampling_ratio/mean": 1.0252978801727295, | |
| "sampling/importance_sampling_ratio/min": 0.6192880272865295, | |
| "sampling/sampling_logp_difference/max": 0.35935235023498535, | |
| "sampling/sampling_logp_difference/mean": 0.013161457143723965, | |
| "step": 1, | |
| "step_time": 18.950907858999926 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 0.19560225727036595, | |
| "epoch": 4e-05, | |
| "grad_norm": 0.6270994544029236, | |
| "kl": 0.0, | |
| "learning_rate": 2.8571428571428573e-06, | |
| "loss": 0.0026, | |
| "step": 2, | |
| "step_time": 6.845600487000297 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0026041667442768812, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 817.0, | |
| "completions/max_terminated_length": 817.0, | |
| "completions/mean_length": 745.078125, | |
| "completions/mean_terminated_length": 745.078125, | |
| "completions/min_length": 290.0, | |
| "completions/min_terminated_length": 290.0, | |
| "entropy": 0.1830942602828145, | |
| "epoch": 6e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6748153567314148, | |
| "kl": 0.0004804102204616356, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": -0.0139, | |
| "num_tokens": 204643.0, | |
| "reward": 0.07987305521965027, | |
| "reward_std": 6.112407207489014, | |
| "rewards/rollout_reward_func/mean": 0.07987302541732788, | |
| "rewards/rollout_reward_func/std": 6.9746317863464355, | |
| "sampling/importance_sampling_ratio/max": 1.6137751340866089, | |
| "sampling/importance_sampling_ratio/mean": 1.0131056308746338, | |
| "sampling/importance_sampling_ratio/min": 0.5117371678352356, | |
| "sampling/sampling_logp_difference/max": 0.6347737312316895, | |
| "sampling/sampling_logp_difference/mean": 0.013132400810718536, | |
| "step": 3, | |
| "step_time": 20.457778603000065 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0062500000931322575, | |
| "clip_ratio/high_mean": 0.0015625000232830644, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002864583395421505, | |
| "entropy": 0.18449228629469872, | |
| "epoch": 8e-05, | |
| "grad_norm": 0.7855743169784546, | |
| "kl": 0.0004326992366259219, | |
| "learning_rate": 8.571428571428573e-06, | |
| "loss": -0.0127, | |
| "step": 4, | |
| "step_time": 7.153126219000001 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625000465661287, | |
| "clip_ratio/high_mean": 0.003906250116415322, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003906250116415322, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 773.3125, | |
| "completions/mean_terminated_length": 773.3125, | |
| "completions/min_length": 691.0, | |
| "completions/min_terminated_length": 691.0, | |
| "entropy": 0.19608404766768217, | |
| "epoch": 0.0001, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6154729723930359, | |
| "kl": 0.0007404440693790093, | |
| "learning_rate": 1.1428571428571429e-05, | |
| "loss": -0.0267, | |
| "num_tokens": 308926.0, | |
| "reward": -2.357975721359253, | |
| "reward_std": 5.998347282409668, | |
| "rewards/rollout_reward_func/mean": -2.357975721359253, | |
| "rewards/rollout_reward_func/std": 6.508192539215088, | |
| "sampling/importance_sampling_ratio/max": 1.5696072578430176, | |
| "sampling/importance_sampling_ratio/mean": 1.0018606185913086, | |
| "sampling/importance_sampling_ratio/min": 0.6378414630889893, | |
| "sampling/sampling_logp_difference/max": 0.4687232971191406, | |
| "sampling/sampling_logp_difference/mean": 0.014497373253107071, | |
| "step": 5, | |
| "step_time": 21.077881563999767 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0027225379599258304, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005326704704202712, | |
| "entropy": 0.20008834172040224, | |
| "epoch": 0.00012, | |
| "grad_norm": 0.613211989402771, | |
| "kl": 0.0017206422435265267, | |
| "learning_rate": 1.4285714285714285e-05, | |
| "loss": -0.0283, | |
| "step": 6, | |
| "step_time": 8.075609097999632 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 843.0, | |
| "completions/max_terminated_length": 843.0, | |
| "completions/mean_length": 754.3125, | |
| "completions/mean_terminated_length": 754.3125, | |
| "completions/min_length": 302.0, | |
| "completions/min_terminated_length": 302.0, | |
| "entropy": 0.21214309986680746, | |
| "epoch": 0.00014, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5866758823394775, | |
| "kl": 0.0038609652619925328, | |
| "learning_rate": 1.7142857142857145e-05, | |
| "loss": 0.0015, | |
| "num_tokens": 413194.0, | |
| "reward": -0.5192327499389648, | |
| "reward_std": 8.747434616088867, | |
| "rewards/rollout_reward_func/mean": -0.5192328095436096, | |
| "rewards/rollout_reward_func/std": 9.696125030517578, | |
| "sampling/importance_sampling_ratio/max": 1.3741450309753418, | |
| "sampling/importance_sampling_ratio/mean": 0.988805890083313, | |
| "sampling/importance_sampling_ratio/min": 0.6078794002532959, | |
| "sampling/sampling_logp_difference/max": 0.25654804706573486, | |
| "sampling/sampling_logp_difference/mean": 0.012450095266103745, | |
| "step": 7, | |
| "step_time": 21.18102895699974 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.042140152771025896, | |
| "clip_ratio/high_mean": 0.010535038192756474, | |
| "clip_ratio/low_mean": 0.011718750349245965, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02225378854200244, | |
| "entropy": 0.2212895406410098, | |
| "epoch": 0.00016, | |
| "grad_norm": 0.5727657675743103, | |
| "kl": 0.01148045047011692, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0002, | |
| "step": 8, | |
| "step_time": 8.206865795999875 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.004142992664128542, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006747159408405423, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 822.0, | |
| "completions/max_terminated_length": 822.0, | |
| "completions/mean_length": 742.765625, | |
| "completions/mean_terminated_length": 742.765625, | |
| "completions/min_length": 286.0, | |
| "completions/min_terminated_length": 286.0, | |
| "entropy": 0.25475312024354935, | |
| "epoch": 0.00018, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7087565064430237, | |
| "kl": 0.03194100991822779, | |
| "learning_rate": 2.2857142857142858e-05, | |
| "loss": 0.0241, | |
| "num_tokens": 516181.0, | |
| "reward": -2.378840684890747, | |
| "reward_std": 6.36100959777832, | |
| "rewards/rollout_reward_func/mean": -2.378840446472168, | |
| "rewards/rollout_reward_func/std": 7.315836429595947, | |
| "sampling/importance_sampling_ratio/max": 1.6080894470214844, | |
| "sampling/importance_sampling_ratio/mean": 1.0152499675750732, | |
| "sampling/importance_sampling_ratio/min": 0.4359276592731476, | |
| "sampling/sampling_logp_difference/max": 0.4399428367614746, | |
| "sampling/sampling_logp_difference/mean": 0.028559193015098572, | |
| "step": 9, | |
| "step_time": 22.1539967839999 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04734848625957966, | |
| "clip_ratio/high_mean": 0.013139204937033355, | |
| "clip_ratio/low_mean": 0.007930871448479593, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021070076385512948, | |
| "entropy": 0.26416848599910736, | |
| "epoch": 0.0002, | |
| "grad_norm": 0.6573855876922607, | |
| "kl": 0.03966027498245239, | |
| "learning_rate": 2.5714285714285714e-05, | |
| "loss": 0.021, | |
| "step": 10, | |
| "step_time": 7.0971175310000945 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 822.0, | |
| "completions/max_terminated_length": 822.0, | |
| "completions/mean_length": 750.75, | |
| "completions/mean_terminated_length": 750.75, | |
| "completions/min_length": 608.0, | |
| "completions/min_terminated_length": 608.0, | |
| "entropy": 0.2343001812696457, | |
| "epoch": 0.00022, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.864137589931488, | |
| "kl": 0.03614223480690271, | |
| "learning_rate": 2.857142857142857e-05, | |
| "loss": -0.018, | |
| "num_tokens": 619774.0, | |
| "reward": -1.144383430480957, | |
| "reward_std": 9.403154373168945, | |
| "rewards/rollout_reward_func/mean": -1.144383192062378, | |
| "rewards/rollout_reward_func/std": 10.208455085754395, | |
| "sampling/importance_sampling_ratio/max": 1.6737509965896606, | |
| "sampling/importance_sampling_ratio/mean": 1.0005735158920288, | |
| "sampling/importance_sampling_ratio/min": 0.5264889001846313, | |
| "sampling/sampling_logp_difference/max": 0.7381381988525391, | |
| "sampling/sampling_logp_difference/mean": 0.03099803999066353, | |
| "step": 11, | |
| "step_time": 24.1759815060002 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04261363763362169, | |
| "clip_ratio/high_mean": 0.011955492780543864, | |
| "clip_ratio/low_mean": 0.018229166977107525, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.030184659990482032, | |
| "entropy": 0.23695118725299835, | |
| "epoch": 0.00024, | |
| "grad_norm": 0.5675711631774902, | |
| "kl": 0.05231437139445916, | |
| "learning_rate": 3.142857142857143e-05, | |
| "loss": -0.0247, | |
| "step": 12, | |
| "step_time": 7.241505567000331 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 827.0, | |
| "completions/max_terminated_length": 827.0, | |
| "completions/mean_length": 766.03125, | |
| "completions/mean_terminated_length": 766.03125, | |
| "completions/min_length": 638.0, | |
| "completions/min_terminated_length": 638.0, | |
| "entropy": 0.23159058205783367, | |
| "epoch": 0.00026, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.560762345790863, | |
| "kl": 0.10253941919654608, | |
| "learning_rate": 3.428571428571429e-05, | |
| "loss": -0.0132, | |
| "num_tokens": 725286.0, | |
| "reward": 0.9126645922660828, | |
| "reward_std": 8.317488670349121, | |
| "rewards/rollout_reward_func/mean": 0.9126646518707275, | |
| "rewards/rollout_reward_func/std": 9.508187294006348, | |
| "sampling/importance_sampling_ratio/max": 1.4912891387939453, | |
| "sampling/importance_sampling_ratio/mean": 0.9157562255859375, | |
| "sampling/importance_sampling_ratio/min": 0.15846048295497894, | |
| "sampling/sampling_logp_difference/max": 0.9116353988647461, | |
| "sampling/sampling_logp_difference/mean": 0.03342486917972565, | |
| "step": 13, | |
| "step_time": 24.797564555999315 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02083333395421505, | |
| "clip_ratio/high_mean": 0.0052083334885537624, | |
| "clip_ratio/low_mean": 0.04107481171377003, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.046283145202323794, | |
| "entropy": 0.21706843469291925, | |
| "epoch": 0.00028, | |
| "grad_norm": 0.737306535243988, | |
| "kl": 0.20574123412370682, | |
| "learning_rate": 3.7142857142857143e-05, | |
| "loss": -0.0141, | |
| "step": 14, | |
| "step_time": 8.782559869000124 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.002864583395421505, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002864583395421505, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 838.0, | |
| "completions/max_terminated_length": 838.0, | |
| "completions/mean_length": 730.265625, | |
| "completions/mean_terminated_length": 730.265625, | |
| "completions/min_length": 293.0, | |
| "completions/min_terminated_length": 293.0, | |
| "entropy": 0.20442467741668224, | |
| "epoch": 0.0003, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7728816866874695, | |
| "kl": 0.10564538510516286, | |
| "learning_rate": 4e-05, | |
| "loss": 0.0314, | |
| "num_tokens": 827749.0, | |
| "reward": -1.6128692626953125, | |
| "reward_std": 6.231240272521973, | |
| "rewards/rollout_reward_func/mean": -1.6128690242767334, | |
| "rewards/rollout_reward_func/std": 6.545647621154785, | |
| "sampling/importance_sampling_ratio/max": 1.7540509700775146, | |
| "sampling/importance_sampling_ratio/mean": 1.0142356157302856, | |
| "sampling/importance_sampling_ratio/min": 0.45990973711013794, | |
| "sampling/sampling_logp_difference/max": 0.7248215675354004, | |
| "sampling/sampling_logp_difference/mean": 0.030622530728578568, | |
| "step": 15, | |
| "step_time": 24.38517581200017 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.047821971122175455, | |
| "clip_ratio/high_mean": 0.013257576036266983, | |
| "clip_ratio/low_mean": 0.025236743036657572, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0384943193057552, | |
| "entropy": 0.19760818500071764, | |
| "epoch": 0.00032, | |
| "grad_norm": 0.6611685752868652, | |
| "kl": 0.11387888877652586, | |
| "learning_rate": 4.2857142857142856e-05, | |
| "loss": 0.0262, | |
| "step": 16, | |
| "step_time": 7.110903799999505 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 826.0, | |
| "completions/max_terminated_length": 826.0, | |
| "completions/mean_length": 760.359375, | |
| "completions/mean_terminated_length": 760.359375, | |
| "completions/min_length": 659.0, | |
| "completions/min_terminated_length": 659.0, | |
| "entropy": 0.19120646081864834, | |
| "epoch": 0.00034, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.0170923471450806, | |
| "kl": 0.08101693401113153, | |
| "learning_rate": 4.5714285714285716e-05, | |
| "loss": -0.015, | |
| "num_tokens": 931841.0, | |
| "reward": -1.6879972219467163, | |
| "reward_std": 9.023077011108398, | |
| "rewards/rollout_reward_func/mean": -1.6879971027374268, | |
| "rewards/rollout_reward_func/std": 10.298378944396973, | |
| "sampling/importance_sampling_ratio/max": 2.430154800415039, | |
| "sampling/importance_sampling_ratio/mean": 1.065093755722046, | |
| "sampling/importance_sampling_ratio/min": 0.6535128951072693, | |
| "sampling/sampling_logp_difference/max": 0.7661471366882324, | |
| "sampling/sampling_logp_difference/mean": 0.024486079812049866, | |
| "step": 17, | |
| "step_time": 27.987481355 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.043560607358813286, | |
| "clip_ratio/high_mean": 0.016335227992385626, | |
| "clip_ratio/low_mean": 0.01846590987406671, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03480113763362169, | |
| "entropy": 0.19553834106773138, | |
| "epoch": 0.00036, | |
| "grad_norm": 0.5111234784126282, | |
| "kl": 0.088710677344352, | |
| "learning_rate": 4.8571428571428576e-05, | |
| "loss": -0.0206, | |
| "step": 18, | |
| "step_time": 7.182192339999801 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003906250116415322, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 816.0, | |
| "completions/max_terminated_length": 816.0, | |
| "completions/mean_length": 733.640625, | |
| "completions/mean_terminated_length": 733.640625, | |
| "completions/min_length": 296.0, | |
| "completions/min_terminated_length": 296.0, | |
| "entropy": 0.1935133864171803, | |
| "epoch": 0.00038, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7022229433059692, | |
| "kl": 0.1404350029770285, | |
| "learning_rate": 5.142857142857143e-05, | |
| "loss": -0.0003, | |
| "num_tokens": 1033723.0, | |
| "reward": -1.2022110223770142, | |
| "reward_std": 10.956363677978516, | |
| "rewards/rollout_reward_func/mean": -1.2022109031677246, | |
| "rewards/rollout_reward_func/std": 12.292625427246094, | |
| "sampling/importance_sampling_ratio/max": 1.6157236099243164, | |
| "sampling/importance_sampling_ratio/mean": 0.9594892263412476, | |
| "sampling/importance_sampling_ratio/min": 0.3754613697528839, | |
| "sampling/sampling_logp_difference/max": 0.9176025390625, | |
| "sampling/sampling_logp_difference/mean": 0.028035998344421387, | |
| "step": 19, | |
| "step_time": 27.688288005999993 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04876894084736705, | |
| "clip_ratio/high_mean": 0.012192235211841762, | |
| "clip_ratio/low_mean": 0.018584280740469694, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0307765161851421, | |
| "entropy": 0.20130611211061478, | |
| "epoch": 0.0004, | |
| "grad_norm": 0.4695027768611908, | |
| "kl": 0.18750765593722463, | |
| "learning_rate": 5.428571428571428e-05, | |
| "loss": -0.0054, | |
| "step": 20, | |
| "step_time": 7.739605327000618 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0014204545877873898, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014204545877873898, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 823.0, | |
| "completions/max_terminated_length": 823.0, | |
| "completions/mean_length": 737.9375, | |
| "completions/mean_terminated_length": 737.9375, | |
| "completions/min_length": 618.0, | |
| "completions/min_terminated_length": 618.0, | |
| "entropy": 0.18132759165018797, | |
| "epoch": 0.00042, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.1652212142944336, | |
| "kl": 0.13582510640844703, | |
| "learning_rate": 5.714285714285714e-05, | |
| "loss": 0.0262, | |
| "num_tokens": 1135968.0, | |
| "reward": -0.28913062810897827, | |
| "reward_std": 7.3008809089660645, | |
| "rewards/rollout_reward_func/mean": -0.28913065791130066, | |
| "rewards/rollout_reward_func/std": 7.988962650299072, | |
| "sampling/importance_sampling_ratio/max": 2.336996555328369, | |
| "sampling/importance_sampling_ratio/mean": 1.0362560749053955, | |
| "sampling/importance_sampling_ratio/min": 0.6398296356201172, | |
| "sampling/sampling_logp_difference/max": 0.6417920589447021, | |
| "sampling/sampling_logp_difference/mean": 0.022837379947304726, | |
| "step": 21, | |
| "step_time": 28.57662482000046 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02083333395421505, | |
| "clip_ratio/high_mean": 0.006510416860692203, | |
| "clip_ratio/low_mean": 0.025386679684743285, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03189709666185081, | |
| "entropy": 0.1763849752023816, | |
| "epoch": 0.00044, | |
| "grad_norm": 0.3849461078643799, | |
| "kl": 0.16632835287600756, | |
| "learning_rate": 6e-05, | |
| "loss": 0.0212, | |
| "step": 22, | |
| "step_time": 8.287740409000207 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0014204545877873898, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004024621332064271, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 826.0, | |
| "completions/max_terminated_length": 826.0, | |
| "completions/mean_length": 723.875, | |
| "completions/mean_terminated_length": 723.875, | |
| "completions/min_length": 305.0, | |
| "completions/min_terminated_length": 305.0, | |
| "entropy": 0.1840990763157606, | |
| "epoch": 0.00046, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.860249936580658, | |
| "kl": 0.25097968662157655, | |
| "learning_rate": 6.285714285714286e-05, | |
| "loss": 0.0286, | |
| "num_tokens": 1237057.0, | |
| "reward": 0.4839830696582794, | |
| "reward_std": 10.420938491821289, | |
| "rewards/rollout_reward_func/mean": 0.4839830994606018, | |
| "rewards/rollout_reward_func/std": 11.429144859313965, | |
| "sampling/importance_sampling_ratio/max": 2.106267213821411, | |
| "sampling/importance_sampling_ratio/mean": 1.0313048362731934, | |
| "sampling/importance_sampling_ratio/min": 0.574251651763916, | |
| "sampling/sampling_logp_difference/max": 0.8508915901184082, | |
| "sampling/sampling_logp_difference/mean": 0.02066868171095848, | |
| "step": 23, | |
| "step_time": 28.494462327999827 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06818181974813342, | |
| "clip_ratio/high_mean": 0.021070076152682304, | |
| "clip_ratio/low_mean": 0.018347538076341152, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03941761387977749, | |
| "entropy": 0.19043638091534376, | |
| "epoch": 0.00048, | |
| "grad_norm": 0.6448091864585876, | |
| "kl": 0.35418248968198895, | |
| "learning_rate": 6.571428571428571e-05, | |
| "loss": 0.0215, | |
| "step": 24, | |
| "step_time": 7.416647947999536 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.004024621332064271, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005326704704202712, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 829.0, | |
| "completions/max_terminated_length": 829.0, | |
| "completions/mean_length": 731.0625, | |
| "completions/mean_terminated_length": 731.0625, | |
| "completions/min_length": 615.0, | |
| "completions/min_terminated_length": 615.0, | |
| "entropy": 0.1908296812325716, | |
| "epoch": 0.0005, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.66495680809021, | |
| "kl": 0.21043909061700106, | |
| "learning_rate": 6.857142857142858e-05, | |
| "loss": -0.0275, | |
| "num_tokens": 1337760.0, | |
| "reward": 0.9224299788475037, | |
| "reward_std": 10.655890464782715, | |
| "rewards/rollout_reward_func/mean": 0.9224300384521484, | |
| "rewards/rollout_reward_func/std": 12.821269989013672, | |
| "sampling/importance_sampling_ratio/max": 1.5019664764404297, | |
| "sampling/importance_sampling_ratio/mean": 1.0262192487716675, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9519531726837158, | |
| "sampling/sampling_logp_difference/mean": 0.018259627744555473, | |
| "step": 25, | |
| "step_time": 29.797745564000707 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05823863809928298, | |
| "clip_ratio/high_mean": 0.017163826269097626, | |
| "clip_ratio/low_mean": 0.024147727992385626, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.041311554377898574, | |
| "entropy": 0.1913931304588914, | |
| "epoch": 0.00052, | |
| "grad_norm": 0.5575593709945679, | |
| "kl": 0.26408666698262095, | |
| "learning_rate": 7.142857142857143e-05, | |
| "loss": -0.0322, | |
| "step": 26, | |
| "step_time": 7.109563219000847 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 827.0, | |
| "completions/max_terminated_length": 827.0, | |
| "completions/mean_length": 731.640625, | |
| "completions/mean_terminated_length": 731.640625, | |
| "completions/min_length": 185.0, | |
| "completions/min_terminated_length": 185.0, | |
| "entropy": 0.19422233663499355, | |
| "epoch": 0.00054, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6210283637046814, | |
| "kl": 0.21635392913594842, | |
| "learning_rate": 7.428571428571429e-05, | |
| "loss": -0.0185, | |
| "num_tokens": 1439214.0, | |
| "reward": 0.326141357421875, | |
| "reward_std": 13.388666152954102, | |
| "rewards/rollout_reward_func/mean": 0.32614123821258545, | |
| "rewards/rollout_reward_func/std": 14.97364616394043, | |
| "sampling/importance_sampling_ratio/max": 1.5914506912231445, | |
| "sampling/importance_sampling_ratio/mean": 1.0221253633499146, | |
| "sampling/importance_sampling_ratio/min": 0.7667937874794006, | |
| "sampling/sampling_logp_difference/max": 0.37548696994781494, | |
| "sampling/sampling_logp_difference/mean": 0.012905368581414223, | |
| "step": 27, | |
| "step_time": 28.513997486000562 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05255681974813342, | |
| "clip_ratio/high_mean": 0.01574337179772556, | |
| "clip_ratio/low_mean": 0.01661931863054633, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.032362690777517855, | |
| "entropy": 0.1939925504848361, | |
| "epoch": 0.00056, | |
| "grad_norm": 0.2964678406715393, | |
| "kl": 0.22840850101783872, | |
| "learning_rate": 7.714285714285715e-05, | |
| "loss": -0.0252, | |
| "step": 28, | |
| "step_time": 8.46359607699992 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010890151839703321, | |
| "clip_ratio/high_mean": 0.0027225379599258304, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004024621448479593, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 834.0, | |
| "completions/max_terminated_length": 834.0, | |
| "completions/mean_length": 714.359375, | |
| "completions/mean_terminated_length": 714.359375, | |
| "completions/min_length": 503.0, | |
| "completions/min_terminated_length": 503.0, | |
| "entropy": 0.1717732958495617, | |
| "epoch": 0.00058, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3590966761112213, | |
| "kl": 0.24717363435775042, | |
| "learning_rate": 8e-05, | |
| "loss": 0.0036, | |
| "num_tokens": 1539055.0, | |
| "reward": 1.930895447731018, | |
| "reward_std": 8.148633003234863, | |
| "rewards/rollout_reward_func/mean": 1.930895447731018, | |
| "rewards/rollout_reward_func/std": 9.020356178283691, | |
| "sampling/importance_sampling_ratio/max": 1.6024476289749146, | |
| "sampling/importance_sampling_ratio/mean": 1.0161041021347046, | |
| "sampling/importance_sampling_ratio/min": 0.7807760238647461, | |
| "sampling/sampling_logp_difference/max": 0.35602256655693054, | |
| "sampling/sampling_logp_difference/mean": 0.011149970814585686, | |
| "step": 29, | |
| "step_time": 28.064759372000253 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.027083334047347307, | |
| "clip_ratio/high_mean": 0.006770833511836827, | |
| "clip_ratio/low_mean": 0.029711175127886236, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03648200852330774, | |
| "entropy": 0.16414203867316246, | |
| "epoch": 0.0006, | |
| "grad_norm": 0.38951048254966736, | |
| "kl": 0.28005583630874753, | |
| "learning_rate": 8.285714285714287e-05, | |
| "loss": 0.0013, | |
| "step": 30, | |
| "step_time": 7.401456857000312 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 809.0, | |
| "completions/max_terminated_length": 809.0, | |
| "completions/mean_length": 708.046875, | |
| "completions/mean_terminated_length": 708.046875, | |
| "completions/min_length": 475.0, | |
| "completions/min_terminated_length": 475.0, | |
| "entropy": 0.16439654119312763, | |
| "epoch": 0.00062, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5445168614387512, | |
| "kl": 0.2800124539062381, | |
| "learning_rate": 8.571428571428571e-05, | |
| "loss": 0.0097, | |
| "num_tokens": 1638113.0, | |
| "reward": 0.29781579971313477, | |
| "reward_std": 10.009416580200195, | |
| "rewards/rollout_reward_func/mean": 0.29781582951545715, | |
| "rewards/rollout_reward_func/std": 11.176705360412598, | |
| "sampling/importance_sampling_ratio/max": 1.755067229270935, | |
| "sampling/importance_sampling_ratio/mean": 1.0180511474609375, | |
| "sampling/importance_sampling_ratio/min": 0.580125629901886, | |
| "sampling/sampling_logp_difference/max": 0.5197739601135254, | |
| "sampling/sampling_logp_difference/mean": 0.013791397213935852, | |
| "step": 31, | |
| "step_time": 30.773730244999797 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03645833395421505, | |
| "clip_ratio/high_mean": 0.013139204704202712, | |
| "clip_ratio/low_mean": 0.03042140242177993, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04356060677673668, | |
| "entropy": 0.15187342395074666, | |
| "epoch": 0.00064, | |
| "grad_norm": 0.30164626240730286, | |
| "kl": 0.32055927254259586, | |
| "learning_rate": 8.857142857142857e-05, | |
| "loss": 0.0037, | |
| "step": 32, | |
| "step_time": 7.328695028999618 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0027225379599258304, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004024621332064271, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 835.0, | |
| "completions/max_terminated_length": 835.0, | |
| "completions/mean_length": 701.5, | |
| "completions/mean_terminated_length": 701.5, | |
| "completions/min_length": 502.0, | |
| "completions/min_terminated_length": 502.0, | |
| "entropy": 0.1332990936934948, | |
| "epoch": 0.00066, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.43104609847068787, | |
| "kl": 0.32164820563048124, | |
| "learning_rate": 9.142857142857143e-05, | |
| "loss": 0.0025, | |
| "num_tokens": 1738075.0, | |
| "reward": 3.1038765907287598, | |
| "reward_std": 11.951395988464355, | |
| "rewards/rollout_reward_func/mean": 3.1038765907287598, | |
| "rewards/rollout_reward_func/std": 12.847871780395508, | |
| "sampling/importance_sampling_ratio/max": 1.3508435487747192, | |
| "sampling/importance_sampling_ratio/mean": 0.9952214360237122, | |
| "sampling/importance_sampling_ratio/min": 0.6407750844955444, | |
| "sampling/sampling_logp_difference/max": 0.47523796558380127, | |
| "sampling/sampling_logp_difference/mean": 0.013571259565651417, | |
| "step": 33, | |
| "step_time": 27.829260915000077 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03219697065651417, | |
| "clip_ratio/high_mean": 0.010653409408405423, | |
| "clip_ratio/low_mean": 0.029000947601161897, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03965435700956732, | |
| "entropy": 0.12380115175619721, | |
| "epoch": 0.00068, | |
| "grad_norm": 0.27367016673088074, | |
| "kl": 0.423783166334033, | |
| "learning_rate": 9.428571428571429e-05, | |
| "loss": -0.0, | |
| "step": 34, | |
| "step_time": 7.799126809999507 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 819.0, | |
| "completions/max_terminated_length": 819.0, | |
| "completions/mean_length": 687.4375, | |
| "completions/mean_terminated_length": 687.4375, | |
| "completions/min_length": 618.0, | |
| "completions/min_terminated_length": 618.0, | |
| "entropy": 0.10798696288838983, | |
| "epoch": 0.0007, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4482150673866272, | |
| "kl": 0.3214763030409813, | |
| "learning_rate": 9.714285714285715e-05, | |
| "loss": 0.028, | |
| "num_tokens": 1836043.0, | |
| "reward": 3.037400960922241, | |
| "reward_std": 12.985002517700195, | |
| "rewards/rollout_reward_func/mean": 3.037400960922241, | |
| "rewards/rollout_reward_func/std": 13.425616264343262, | |
| "sampling/importance_sampling_ratio/max": 1.4862518310546875, | |
| "sampling/importance_sampling_ratio/mean": 1.0146703720092773, | |
| "sampling/importance_sampling_ratio/min": 0.5140225291252136, | |
| "sampling/sampling_logp_difference/max": 0.8002816438674927, | |
| "sampling/sampling_logp_difference/mean": 0.01363956555724144, | |
| "step": 35, | |
| "step_time": 28.662696071000028 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.042140152771025896, | |
| "clip_ratio/high_mean": 0.010535038192756474, | |
| "clip_ratio/low_mean": 0.014441288309171796, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.024976326269097626, | |
| "entropy": 0.1118780323304236, | |
| "epoch": 0.00072, | |
| "grad_norm": 0.1983855962753296, | |
| "kl": 0.373223016038537, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0232, | |
| "step": 36, | |
| "step_time": 8.269840026000338 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 811.0, | |
| "completions/max_terminated_length": 811.0, | |
| "completions/mean_length": 679.3125, | |
| "completions/mean_terminated_length": 679.3125, | |
| "completions/min_length": 449.0, | |
| "completions/min_terminated_length": 449.0, | |
| "entropy": 0.12342227855697274, | |
| "epoch": 0.00074, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6195780634880066, | |
| "kl": 0.45714515913277864, | |
| "learning_rate": 9.999999998148153e-05, | |
| "loss": -0.0249, | |
| "num_tokens": 1932947.0, | |
| "reward": 3.72019362449646, | |
| "reward_std": 11.354637145996094, | |
| "rewards/rollout_reward_func/mean": 3.720193862915039, | |
| "rewards/rollout_reward_func/std": 11.66490650177002, | |
| "sampling/importance_sampling_ratio/max": 2.1260557174682617, | |
| "sampling/importance_sampling_ratio/mean": 1.049971580505371, | |
| "sampling/importance_sampling_ratio/min": 0.6164436340332031, | |
| "sampling/sampling_logp_difference/max": 0.5450749397277832, | |
| "sampling/sampling_logp_difference/mean": 0.01501537300646305, | |
| "step": 37, | |
| "step_time": 27.480367904999866 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05303030414506793, | |
| "clip_ratio/high_mean": 0.014678030624054372, | |
| "clip_ratio/low_mean": 0.014322917209938169, | |
| "clip_ratio/low_min": 0.0052083334885537624, | |
| "clip_ratio/region_mean": 0.029000947950407863, | |
| "entropy": 0.13006606698036194, | |
| "epoch": 0.00076, | |
| "grad_norm": 0.2681926488876343, | |
| "kl": 0.4847450293600559, | |
| "learning_rate": 9.999999992592612e-05, | |
| "loss": -0.0318, | |
| "step": 38, | |
| "step_time": 7.225284665000345 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 700.09375, | |
| "completions/mean_terminated_length": 700.09375, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "entropy": 0.15452369069680572, | |
| "epoch": 0.00078, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4834868311882019, | |
| "kl": 0.4672291334718466, | |
| "learning_rate": 9.999999983333379e-05, | |
| "loss": -0.0162, | |
| "num_tokens": 2032280.0, | |
| "reward": 5.62964391708374, | |
| "reward_std": 9.88559341430664, | |
| "rewards/rollout_reward_func/mean": 5.629644393920898, | |
| "rewards/rollout_reward_func/std": 12.693258285522461, | |
| "sampling/importance_sampling_ratio/max": 1.5066994428634644, | |
| "sampling/importance_sampling_ratio/mean": 1.0094711780548096, | |
| "sampling/importance_sampling_ratio/min": 0.6512829065322876, | |
| "sampling/sampling_logp_difference/max": 0.4918508529663086, | |
| "sampling/sampling_logp_difference/mean": 0.01460680365562439, | |
| "step": 39, | |
| "step_time": 30.803230847000123 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05823863809928298, | |
| "clip_ratio/high_mean": 0.01976799312978983, | |
| "clip_ratio/low_mean": 0.02734375069849193, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04711174394469708, | |
| "entropy": 0.14933442790061235, | |
| "epoch": 0.0008, | |
| "grad_norm": 0.34873443841934204, | |
| "kl": 0.5781354140490294, | |
| "learning_rate": 9.99999997037045e-05, | |
| "loss": -0.0203, | |
| "step": 40, | |
| "step_time": 7.3111222899999575 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0028409091755747795, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 829.0, | |
| "completions/max_terminated_length": 829.0, | |
| "completions/mean_length": 686.59375, | |
| "completions/mean_terminated_length": 686.59375, | |
| "completions/min_length": 615.0, | |
| "completions/min_terminated_length": 615.0, | |
| "entropy": 0.15176831698045135, | |
| "epoch": 0.00082, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4504550099372864, | |
| "kl": 0.6600655419752002, | |
| "learning_rate": 9.999999953703829e-05, | |
| "loss": -0.0185, | |
| "num_tokens": 2130497.0, | |
| "reward": 2.0073609352111816, | |
| "reward_std": 8.8825044631958, | |
| "rewards/rollout_reward_func/mean": 2.0073609352111816, | |
| "rewards/rollout_reward_func/std": 9.321340560913086, | |
| "sampling/importance_sampling_ratio/max": 1.5246989727020264, | |
| "sampling/importance_sampling_ratio/mean": 1.0359078645706177, | |
| "sampling/importance_sampling_ratio/min": 0.3844473361968994, | |
| "sampling/sampling_logp_difference/max": 0.955810546875, | |
| "sampling/sampling_logp_difference/mean": 0.012838078662753105, | |
| "step": 41, | |
| "step_time": 28.587795755999878 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03172348579391837, | |
| "clip_ratio/high_mean": 0.009232954820618033, | |
| "clip_ratio/low_mean": 0.022608901956118643, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03184185642749071, | |
| "entropy": 0.14899979438632727, | |
| "epoch": 0.00084, | |
| "grad_norm": 2.304894208908081, | |
| "kl": 1.5326191950589418, | |
| "learning_rate": 9.999999933333512e-05, | |
| "loss": -0.0201, | |
| "step": 42, | |
| "step_time": 8.04831712999976 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0014204545877873898, | |
| "clip_ratio/low_mean": 0.0026041667442768812, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004024621332064271, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 825.0, | |
| "completions/max_terminated_length": 825.0, | |
| "completions/mean_length": 685.40625, | |
| "completions/mean_terminated_length": 685.40625, | |
| "completions/min_length": 389.0, | |
| "completions/min_terminated_length": 389.0, | |
| "entropy": 0.1393027831800282, | |
| "epoch": 0.00086, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5649179816246033, | |
| "kl": 0.7229090742766857, | |
| "learning_rate": 9.999999909259503e-05, | |
| "loss": -0.017, | |
| "num_tokens": 2228288.0, | |
| "reward": 1.6912901401519775, | |
| "reward_std": 10.596427917480469, | |
| "rewards/rollout_reward_func/mean": 1.691290020942688, | |
| "rewards/rollout_reward_func/std": 12.0145263671875, | |
| "sampling/importance_sampling_ratio/max": 1.3425889015197754, | |
| "sampling/importance_sampling_ratio/mean": 0.9553788304328918, | |
| "sampling/importance_sampling_ratio/min": 0.5974801778793335, | |
| "sampling/sampling_logp_difference/max": 0.34511590003967285, | |
| "sampling/sampling_logp_difference/mean": 0.01251951139420271, | |
| "step": 43, | |
| "step_time": 27.475775691999615 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04829545598477125, | |
| "clip_ratio/high_mean": 0.015980114112608135, | |
| "clip_ratio/low_mean": 0.024053030996583402, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.040033145574852824, | |
| "entropy": 0.14617095375433564, | |
| "epoch": 0.00088, | |
| "grad_norm": 0.3445337116718292, | |
| "kl": 0.5654929745942354, | |
| "learning_rate": 9.9999998814818e-05, | |
| "loss": -0.023, | |
| "step": 44, | |
| "step_time": 7.598477493999553 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 831.0, | |
| "completions/max_terminated_length": 831.0, | |
| "completions/mean_length": 713.671875, | |
| "completions/mean_terminated_length": 713.671875, | |
| "completions/min_length": 656.0, | |
| "completions/min_terminated_length": 656.0, | |
| "entropy": 0.14390681218355894, | |
| "epoch": 0.0009, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.45263612270355225, | |
| "kl": 0.6904484182596207, | |
| "learning_rate": 9.999999850000404e-05, | |
| "loss": -0.005, | |
| "num_tokens": 2328132.0, | |
| "reward": 2.4324169158935547, | |
| "reward_std": 13.961143493652344, | |
| "rewards/rollout_reward_func/mean": 2.4324169158935547, | |
| "rewards/rollout_reward_func/std": 14.438629150390625, | |
| "sampling/importance_sampling_ratio/max": 1.3720179796218872, | |
| "sampling/importance_sampling_ratio/mean": 1.00229012966156, | |
| "sampling/importance_sampling_ratio/min": 0.6608520746231079, | |
| "sampling/sampling_logp_difference/max": 0.301973819732666, | |
| "sampling/sampling_logp_difference/mean": 0.010271631181240082, | |
| "step": 45, | |
| "step_time": 28.995988180999802 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.026041667442768812, | |
| "clip_ratio/high_mean": 0.006510416860692203, | |
| "clip_ratio/low_mean": 0.02043087175115943, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.026941288728266954, | |
| "entropy": 0.1375666274689138, | |
| "epoch": 0.00092, | |
| "grad_norm": 0.3008887469768524, | |
| "kl": 0.6632084101438522, | |
| "learning_rate": 9.999999814815312e-05, | |
| "loss": -0.0106, | |
| "step": 46, | |
| "step_time": 7.42895066499932 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0027225379599258304, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005326704704202712, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 823.0, | |
| "completions/max_terminated_length": 823.0, | |
| "completions/mean_length": 698.640625, | |
| "completions/mean_terminated_length": 698.640625, | |
| "completions/min_length": 393.0, | |
| "completions/min_terminated_length": 393.0, | |
| "entropy": 0.14624580927193165, | |
| "epoch": 0.00094, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.36133161187171936, | |
| "kl": 0.5184649843722582, | |
| "learning_rate": 9.99999977592653e-05, | |
| "loss": -0.0129, | |
| "num_tokens": 2426521.0, | |
| "reward": 1.375571846961975, | |
| "reward_std": 11.66879940032959, | |
| "rewards/rollout_reward_func/mean": 1.3755717277526855, | |
| "rewards/rollout_reward_func/std": 11.796045303344727, | |
| "sampling/importance_sampling_ratio/max": 1.8656487464904785, | |
| "sampling/importance_sampling_ratio/mean": 1.0228910446166992, | |
| "sampling/importance_sampling_ratio/min": 0.505867063999176, | |
| "sampling/sampling_logp_difference/max": 0.6223084926605225, | |
| "sampling/sampling_logp_difference/mean": 0.011709067039191723, | |
| "step": 47, | |
| "step_time": 29.763493531999984 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03172348579391837, | |
| "clip_ratio/high_mean": 0.007930871448479593, | |
| "clip_ratio/low_mean": 0.02568655402865261, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03361742536071688, | |
| "entropy": 0.14968854701146483, | |
| "epoch": 0.00096, | |
| "grad_norm": 0.17635680735111237, | |
| "kl": 0.5038973540067673, | |
| "learning_rate": 9.999999733334051e-05, | |
| "loss": -0.0167, | |
| "step": 48, | |
| "step_time": 7.652514348999603 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0027225379599258304, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0027225379599258304, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 818.0, | |
| "completions/max_terminated_length": 818.0, | |
| "completions/mean_length": 704.453125, | |
| "completions/mean_terminated_length": 704.453125, | |
| "completions/min_length": 635.0, | |
| "completions/min_terminated_length": 635.0, | |
| "entropy": 0.14840606460347772, | |
| "epoch": 0.00098, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5855311751365662, | |
| "kl": 0.5907826572656631, | |
| "learning_rate": 9.99999968703788e-05, | |
| "loss": 0.0381, | |
| "num_tokens": 2526069.0, | |
| "reward": 4.523091793060303, | |
| "reward_std": 11.536006927490234, | |
| "rewards/rollout_reward_func/mean": 4.523091793060303, | |
| "rewards/rollout_reward_func/std": 12.290811538696289, | |
| "sampling/importance_sampling_ratio/max": 2.122157573699951, | |
| "sampling/importance_sampling_ratio/mean": 1.0083321332931519, | |
| "sampling/importance_sampling_ratio/min": 0.6556381583213806, | |
| "sampling/sampling_logp_difference/max": 0.5623667240142822, | |
| "sampling/sampling_logp_difference/mean": 0.012646196410059929, | |
| "step": 49, | |
| "step_time": 27.48595007899985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05445075919851661, | |
| "clip_ratio/high_mean": 0.017518940148875117, | |
| "clip_ratio/low_mean": 0.03401988744735718, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.051538827014155686, | |
| "entropy": 0.14183657616376877, | |
| "epoch": 0.001, | |
| "grad_norm": 0.3584051728248596, | |
| "kl": 0.5096510350704193, | |
| "learning_rate": 9.999999637038015e-05, | |
| "loss": 0.0365, | |
| "step": 50, | |
| "step_time": 9.165422230000104 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0027225379599258304, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0027225379599258304, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 818.0, | |
| "completions/max_terminated_length": 818.0, | |
| "completions/mean_length": 692.375, | |
| "completions/mean_terminated_length": 692.375, | |
| "completions/min_length": 290.0, | |
| "completions/min_terminated_length": 290.0, | |
| "entropy": 0.14247119799256325, | |
| "epoch": 0.00102, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.49646589159965515, | |
| "kl": 0.4570716666057706, | |
| "learning_rate": 9.999999583334457e-05, | |
| "loss": -0.0101, | |
| "num_tokens": 2623145.0, | |
| "reward": 4.133634567260742, | |
| "reward_std": 10.326797485351562, | |
| "rewards/rollout_reward_func/mean": 4.133634567260742, | |
| "rewards/rollout_reward_func/std": 10.82159423828125, | |
| "sampling/importance_sampling_ratio/max": 1.6070019006729126, | |
| "sampling/importance_sampling_ratio/mean": 0.996033787727356, | |
| "sampling/importance_sampling_ratio/min": 0.5886021852493286, | |
| "sampling/sampling_logp_difference/max": 0.543494701385498, | |
| "sampling/sampling_logp_difference/mean": 0.010751021094620228, | |
| "step": 51, | |
| "step_time": 28.26161867099927 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04829545598477125, | |
| "clip_ratio/high_mean": 0.013375947251915932, | |
| "clip_ratio/low_mean": 0.02781723579391837, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0411931830458343, | |
| "entropy": 0.13112169969826937, | |
| "epoch": 0.00104, | |
| "grad_norm": 0.34045207500457764, | |
| "kl": 0.5393304694443941, | |
| "learning_rate": 9.999999525927207e-05, | |
| "loss": -0.016, | |
| "step": 52, | |
| "step_time": 6.901260032999289 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 825.0, | |
| "completions/max_terminated_length": 825.0, | |
| "completions/mean_length": 691.875, | |
| "completions/mean_terminated_length": 691.875, | |
| "completions/min_length": 462.0, | |
| "completions/min_terminated_length": 462.0, | |
| "entropy": 0.11744047561660409, | |
| "epoch": 0.00106, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3921552300453186, | |
| "kl": 0.42071591690182686, | |
| "learning_rate": 9.999999464816261e-05, | |
| "loss": 0.0037, | |
| "num_tokens": 2721107.0, | |
| "reward": 4.605119705200195, | |
| "reward_std": 12.441184997558594, | |
| "rewards/rollout_reward_func/mean": 4.605119228363037, | |
| "rewards/rollout_reward_func/std": 14.067066192626953, | |
| "sampling/importance_sampling_ratio/max": 1.3290151357650757, | |
| "sampling/importance_sampling_ratio/mean": 0.9739052057266235, | |
| "sampling/importance_sampling_ratio/min": 0.38011765480041504, | |
| "sampling/sampling_logp_difference/max": 0.929356575012207, | |
| "sampling/sampling_logp_difference/mean": 0.010732135735452175, | |
| "step": 53, | |
| "step_time": 30.069013398000834 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02651515230536461, | |
| "clip_ratio/high_mean": 0.006628788076341152, | |
| "clip_ratio/low_mean": 0.022904830053448677, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.029533618362620473, | |
| "entropy": 0.10777218686416745, | |
| "epoch": 0.00108, | |
| "grad_norm": 0.23905742168426514, | |
| "kl": 0.5194222312420607, | |
| "learning_rate": 9.999999400001624e-05, | |
| "loss": 0.002, | |
| "step": 54, | |
| "step_time": 7.081706939000014 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0014204545877873898, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0027225379599258304, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 824.0, | |
| "completions/max_terminated_length": 824.0, | |
| "completions/mean_length": 702.796875, | |
| "completions/mean_terminated_length": 702.796875, | |
| "completions/min_length": 614.0, | |
| "completions/min_terminated_length": 614.0, | |
| "entropy": 0.11523706745356321, | |
| "epoch": 0.0011, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.500625491142273, | |
| "kl": 0.5581346470862627, | |
| "learning_rate": 9.999999331483292e-05, | |
| "loss": -0.0203, | |
| "num_tokens": 2818643.0, | |
| "reward": 3.496170997619629, | |
| "reward_std": 14.47857666015625, | |
| "rewards/rollout_reward_func/mean": 3.496170997619629, | |
| "rewards/rollout_reward_func/std": 14.920737266540527, | |
| "sampling/importance_sampling_ratio/max": 1.5530641078948975, | |
| "sampling/importance_sampling_ratio/mean": 1.0201001167297363, | |
| "sampling/importance_sampling_ratio/min": 0.5336768627166748, | |
| "sampling/sampling_logp_difference/max": 0.6660118103027344, | |
| "sampling/sampling_logp_difference/mean": 0.013495232909917831, | |
| "step": 55, | |
| "step_time": 28.81458637300034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.036931819282472134, | |
| "clip_ratio/high_mean": 0.011837121681310236, | |
| "clip_ratio/low_mean": 0.02758049312978983, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03941761504393071, | |
| "entropy": 0.10888301394879818, | |
| "epoch": 0.00112, | |
| "grad_norm": 0.29490140080451965, | |
| "kl": 0.5603756010532379, | |
| "learning_rate": 9.999999259261268e-05, | |
| "loss": -0.0253, | |
| "step": 56, | |
| "step_time": 8.193311973000164 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0026041667442768812, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 832.0, | |
| "completions/max_terminated_length": 832.0, | |
| "completions/mean_length": 693.90625, | |
| "completions/mean_terminated_length": 693.90625, | |
| "completions/min_length": 494.0, | |
| "completions/min_terminated_length": 494.0, | |
| "entropy": 0.12224696017801762, | |
| "epoch": 0.00114, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4371468722820282, | |
| "kl": 0.5304271820932627, | |
| "learning_rate": 9.99999918333555e-05, | |
| "loss": 0.0189, | |
| "num_tokens": 2916279.0, | |
| "reward": 3.36903715133667, | |
| "reward_std": 12.011173248291016, | |
| "rewards/rollout_reward_func/mean": 3.369036912918091, | |
| "rewards/rollout_reward_func/std": 12.399989128112793, | |
| "sampling/importance_sampling_ratio/max": 1.8561766147613525, | |
| "sampling/importance_sampling_ratio/mean": 1.0033948421478271, | |
| "sampling/importance_sampling_ratio/min": 0.3815801441669464, | |
| "sampling/sampling_logp_difference/max": 0.957763671875, | |
| "sampling/sampling_logp_difference/mean": 0.011768012307584286, | |
| "step": 57, | |
| "step_time": 28.027659202000223 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05255681974813342, | |
| "clip_ratio/high_mean": 0.015743371564894915, | |
| "clip_ratio/low_mean": 0.019767993013374507, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0355113644618541, | |
| "entropy": 0.12679382599890232, | |
| "epoch": 0.00116, | |
| "grad_norm": 0.3022422790527344, | |
| "kl": 0.5225307196378708, | |
| "learning_rate": 9.999999103706142e-05, | |
| "loss": 0.015, | |
| "step": 58, | |
| "step_time": 8.72335070799977 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0026041667442768812, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003906250116415322, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 844.0, | |
| "completions/max_terminated_length": 844.0, | |
| "completions/mean_length": 681.265625, | |
| "completions/mean_terminated_length": 681.265625, | |
| "completions/min_length": 373.0, | |
| "completions/min_terminated_length": 373.0, | |
| "entropy": 0.12399047752842307, | |
| "epoch": 0.00118, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6583297848701477, | |
| "kl": 0.5364211667329073, | |
| "learning_rate": 9.999999020373037e-05, | |
| "loss": 0.0117, | |
| "num_tokens": 3012934.0, | |
| "reward": 2.8170366287231445, | |
| "reward_std": 12.926514625549316, | |
| "rewards/rollout_reward_func/mean": 2.8170366287231445, | |
| "rewards/rollout_reward_func/std": 13.227665901184082, | |
| "sampling/importance_sampling_ratio/max": 2.4036612510681152, | |
| "sampling/importance_sampling_ratio/mean": 0.9975829720497131, | |
| "sampling/importance_sampling_ratio/min": 0.6259334683418274, | |
| "sampling/sampling_logp_difference/max": 0.720775842666626, | |
| "sampling/sampling_logp_difference/mean": 0.010457618162035942, | |
| "step": 59, | |
| "step_time": 28.9596313010004 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0416666679084301, | |
| "clip_ratio/high_mean": 0.011718750349245965, | |
| "clip_ratio/low_mean": 0.03385416732635349, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04557291779201478, | |
| "entropy": 0.11637644609436393, | |
| "epoch": 0.0012, | |
| "grad_norm": 1.9307663440704346, | |
| "kl": 1.8184253200888634, | |
| "learning_rate": 9.999998933336241e-05, | |
| "loss": 0.0213, | |
| "step": 60, | |
| "step_time": 7.307322721999981 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0014204545877873898, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014204545877873898, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 816.0, | |
| "completions/max_terminated_length": 816.0, | |
| "completions/mean_length": 685.78125, | |
| "completions/mean_terminated_length": 685.78125, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "entropy": 0.11613691644743085, | |
| "epoch": 0.00122, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.44358986616134644, | |
| "kl": 0.5193471424281597, | |
| "learning_rate": 9.999998842595753e-05, | |
| "loss": -0.0024, | |
| "num_tokens": 3109806.0, | |
| "reward": 4.651793479919434, | |
| "reward_std": 12.063810348510742, | |
| "rewards/rollout_reward_func/mean": 4.651793479919434, | |
| "rewards/rollout_reward_func/std": 12.754688262939453, | |
| "sampling/importance_sampling_ratio/max": 1.6620776653289795, | |
| "sampling/importance_sampling_ratio/mean": 0.9981948137283325, | |
| "sampling/importance_sampling_ratio/min": 0.6313586831092834, | |
| "sampling/sampling_logp_difference/max": 0.4394187927246094, | |
| "sampling/sampling_logp_difference/mean": 0.009169764816761017, | |
| "step": 61, | |
| "step_time": 30.805930039000714 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.026988637167960405, | |
| "clip_ratio/high_mean": 0.01065340917557478, | |
| "clip_ratio/low_mean": 0.020951705169864, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03160511434543878, | |
| "entropy": 0.12027787417173386, | |
| "epoch": 0.00124, | |
| "grad_norm": 0.3839333951473236, | |
| "kl": 0.5386558780446649, | |
| "learning_rate": 9.999998748151572e-05, | |
| "loss": -0.0001, | |
| "step": 62, | |
| "step_time": 7.061834261000513 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 807.0, | |
| "completions/max_terminated_length": 807.0, | |
| "completions/mean_length": 694.46875, | |
| "completions/mean_terminated_length": 694.46875, | |
| "completions/min_length": 465.0, | |
| "completions/min_terminated_length": 465.0, | |
| "entropy": 0.13281571818515658, | |
| "epoch": 0.00126, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.47250673174858093, | |
| "kl": 0.5621049534529448, | |
| "learning_rate": 9.999998650003696e-05, | |
| "loss": -0.0068, | |
| "num_tokens": 3207160.0, | |
| "reward": 4.072500705718994, | |
| "reward_std": 12.934675216674805, | |
| "rewards/rollout_reward_func/mean": 4.072500705718994, | |
| "rewards/rollout_reward_func/std": 13.5437650680542, | |
| "sampling/importance_sampling_ratio/max": 1.4505815505981445, | |
| "sampling/importance_sampling_ratio/mean": 1.0127054452896118, | |
| "sampling/importance_sampling_ratio/min": 0.644386887550354, | |
| "sampling/sampling_logp_difference/max": 0.46297478675842285, | |
| "sampling/sampling_logp_difference/mean": 0.01112096942961216, | |
| "step": 63, | |
| "step_time": 27.765410665000445 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03645833441987634, | |
| "clip_ratio/high_mean": 0.010416666977107525, | |
| "clip_ratio/low_mean": 0.025236743153072894, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03565340966451913, | |
| "entropy": 0.11304981098510325, | |
| "epoch": 0.00128, | |
| "grad_norm": 0.23461361229419708, | |
| "kl": 0.707372922450304, | |
| "learning_rate": 9.999998548152131e-05, | |
| "loss": -0.0107, | |
| "step": 64, | |
| "step_time": 9.65409977299987 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 827.0, | |
| "completions/max_terminated_length": 827.0, | |
| "completions/mean_length": 715.78125, | |
| "completions/mean_terminated_length": 715.78125, | |
| "completions/min_length": 618.0, | |
| "completions/min_terminated_length": 618.0, | |
| "entropy": 0.11744949175044894, | |
| "epoch": 0.0013, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.5356135368347168, | |
| "kl": 2.483667228370905, | |
| "learning_rate": 9.999998442596872e-05, | |
| "loss": 0.0155, | |
| "num_tokens": 3305784.0, | |
| "reward": 3.657202959060669, | |
| "reward_std": 10.959955215454102, | |
| "rewards/rollout_reward_func/mean": 3.657203197479248, | |
| "rewards/rollout_reward_func/std": 12.17599105834961, | |
| "sampling/importance_sampling_ratio/max": 1.36454439163208, | |
| "sampling/importance_sampling_ratio/mean": 1.0064573287963867, | |
| "sampling/importance_sampling_ratio/min": 0.6259024739265442, | |
| "sampling/sampling_logp_difference/max": 0.4463231563568115, | |
| "sampling/sampling_logp_difference/mean": 0.008584607392549515, | |
| "step": 65, | |
| "step_time": 29.066453170999694 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.031250000931322575, | |
| "clip_ratio/high_mean": 0.010416666860692203, | |
| "clip_ratio/low_mean": 0.02178030402865261, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.032196971122175455, | |
| "entropy": 0.14252985129132867, | |
| "epoch": 0.00132, | |
| "grad_norm": 0.2563531696796417, | |
| "kl": 0.6245546955615282, | |
| "learning_rate": 9.999998333337922e-05, | |
| "loss": -0.0004, | |
| "step": 66, | |
| "step_time": 8.02741467600049 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0014204545877873898, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014204545877873898, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 833.0, | |
| "completions/max_terminated_length": 833.0, | |
| "completions/mean_length": 678.890625, | |
| "completions/mean_terminated_length": 678.890625, | |
| "completions/min_length": 413.0, | |
| "completions/min_terminated_length": 413.0, | |
| "entropy": 0.15250376611948013, | |
| "epoch": 0.00134, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5203387141227722, | |
| "kl": 0.695558762177825, | |
| "learning_rate": 9.999998220375278e-05, | |
| "loss": -0.0145, | |
| "num_tokens": 3401864.0, | |
| "reward": 1.054423451423645, | |
| "reward_std": 11.31953239440918, | |
| "rewards/rollout_reward_func/mean": 1.054423451423645, | |
| "rewards/rollout_reward_func/std": 12.172701835632324, | |
| "sampling/importance_sampling_ratio/max": 1.2194569110870361, | |
| "sampling/importance_sampling_ratio/mean": 0.9876125454902649, | |
| "sampling/importance_sampling_ratio/min": 0.550414502620697, | |
| "sampling/sampling_logp_difference/max": 0.5297477841377258, | |
| "sampling/sampling_logp_difference/mean": 0.008570928126573563, | |
| "step": 67, | |
| "step_time": 30.060426205000795 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05965909268707037, | |
| "clip_ratio/high_mean": 0.018939394736662507, | |
| "clip_ratio/low_mean": 0.03338068269658834, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05232007708400488, | |
| "entropy": 0.17216729745268822, | |
| "epoch": 0.00136, | |
| "grad_norm": 0.27647653222084045, | |
| "kl": 0.6303851045668125, | |
| "learning_rate": 9.999998103708944e-05, | |
| "loss": -0.0169, | |
| "step": 68, | |
| "step_time": 7.55158718300072 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0015625000232830644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0015625000232830644, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 818.0, | |
| "completions/max_terminated_length": 818.0, | |
| "completions/mean_length": 692.171875, | |
| "completions/mean_terminated_length": 692.171875, | |
| "completions/min_length": 387.0, | |
| "completions/min_terminated_length": 387.0, | |
| "entropy": 0.17608004808425903, | |
| "epoch": 0.00138, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.40714362263679504, | |
| "kl": 0.5837149824947119, | |
| "learning_rate": 9.999997983338918e-05, | |
| "loss": 0.0075, | |
| "num_tokens": 3498494.0, | |
| "reward": 4.154041290283203, | |
| "reward_std": 15.997432708740234, | |
| "rewards/rollout_reward_func/mean": 4.154041290283203, | |
| "rewards/rollout_reward_func/std": 18.081926345825195, | |
| "sampling/importance_sampling_ratio/max": 1.240838885307312, | |
| "sampling/importance_sampling_ratio/mean": 0.9964578747749329, | |
| "sampling/importance_sampling_ratio/min": 0.756720781326294, | |
| "sampling/sampling_logp_difference/max": 0.326712965965271, | |
| "sampling/sampling_logp_difference/mean": 0.009719014167785645, | |
| "step": 69, | |
| "step_time": 28.760111235000295 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04450757708400488, | |
| "clip_ratio/high_mean": 0.013731061248108745, | |
| "clip_ratio/low_mean": 0.016698232851922512, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.030429294100031257, | |
| "entropy": 0.1826375536620617, | |
| "epoch": 0.0014, | |
| "grad_norm": 0.4711000919342041, | |
| "kl": 0.5743975602090359, | |
| "learning_rate": 9.999997859265198e-05, | |
| "loss": 0.0045, | |
| "step": 70, | |
| "step_time": 8.15129353600014 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 835.0, | |
| "completions/max_terminated_length": 835.0, | |
| "completions/mean_length": 695.796875, | |
| "completions/mean_terminated_length": 695.796875, | |
| "completions/min_length": 208.0, | |
| "completions/min_terminated_length": 208.0, | |
| "entropy": 0.20647307951003313, | |
| "epoch": 0.00142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3142479956150055, | |
| "kl": 0.5285101179033518, | |
| "learning_rate": 9.999997731487787e-05, | |
| "loss": -0.0177, | |
| "num_tokens": 3595387.0, | |
| "reward": 2.402292490005493, | |
| "reward_std": 13.013188362121582, | |
| "rewards/rollout_reward_func/mean": 2.402292251586914, | |
| "rewards/rollout_reward_func/std": 13.636407852172852, | |
| "sampling/importance_sampling_ratio/max": 1.3384240865707397, | |
| "sampling/importance_sampling_ratio/mean": 1.011613368988037, | |
| "sampling/importance_sampling_ratio/min": 0.776378870010376, | |
| "sampling/sampling_logp_difference/max": 0.2462749481201172, | |
| "sampling/sampling_logp_difference/mean": 0.009866164065897465, | |
| "step": 71, | |
| "step_time": 28.09002854999926 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.043560607358813286, | |
| "clip_ratio/high_mean": 0.013494318816810846, | |
| "clip_ratio/low_mean": 0.012428977759554982, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025923296343535185, | |
| "entropy": 0.20543431770056486, | |
| "epoch": 0.00144, | |
| "grad_norm": 0.23259234428405762, | |
| "kl": 0.5239685252308846, | |
| "learning_rate": 9.999997600006685e-05, | |
| "loss": -0.0218, | |
| "step": 72, | |
| "step_time": 8.672955195000668 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 832.0, | |
| "completions/max_terminated_length": 832.0, | |
| "completions/mean_length": 704.984375, | |
| "completions/mean_terminated_length": 704.984375, | |
| "completions/min_length": 533.0, | |
| "completions/min_terminated_length": 533.0, | |
| "entropy": 0.20190842729061842, | |
| "epoch": 0.00146, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3707256019115448, | |
| "kl": 0.5400361772626638, | |
| "learning_rate": 9.999997464821892e-05, | |
| "loss": 0.006, | |
| "num_tokens": 3692772.0, | |
| "reward": 2.049668073654175, | |
| "reward_std": 15.488001823425293, | |
| "rewards/rollout_reward_func/mean": 2.049668073654175, | |
| "rewards/rollout_reward_func/std": 15.380194664001465, | |
| "sampling/importance_sampling_ratio/max": 1.1559480428695679, | |
| "sampling/importance_sampling_ratio/mean": 0.970598578453064, | |
| "sampling/importance_sampling_ratio/min": 0.6524748802185059, | |
| "sampling/sampling_logp_difference/max": 0.35463929176330566, | |
| "sampling/sampling_logp_difference/mean": 0.009403295814990997, | |
| "step": 73, | |
| "step_time": 28.82742140900018 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06818182021379471, | |
| "clip_ratio/high_mean": 0.018347538425587118, | |
| "clip_ratio/low_mean": 0.02402935700956732, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04237689543515444, | |
| "entropy": 0.2009204039350152, | |
| "epoch": 0.00148, | |
| "grad_norm": 0.2297271341085434, | |
| "kl": 0.5404210295528173, | |
| "learning_rate": 9.999997325933408e-05, | |
| "loss": 0.001, | |
| "step": 74, | |
| "step_time": 7.489119195001422 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0015625000232830644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666767559946, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 810.0, | |
| "completions/max_terminated_length": 810.0, | |
| "completions/mean_length": 688.1875, | |
| "completions/mean_terminated_length": 688.1875, | |
| "completions/min_length": 607.0, | |
| "completions/min_terminated_length": 607.0, | |
| "entropy": 0.19008919596672058, | |
| "epoch": 0.0015, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.39288049936294556, | |
| "kl": 0.5065996870398521, | |
| "learning_rate": 9.999997183341232e-05, | |
| "loss": -0.0174, | |
| "num_tokens": 3789251.0, | |
| "reward": 5.712855339050293, | |
| "reward_std": 12.491518020629883, | |
| "rewards/rollout_reward_func/mean": 5.712855339050293, | |
| "rewards/rollout_reward_func/std": 13.803718566894531, | |
| "sampling/importance_sampling_ratio/max": 1.3862897157669067, | |
| "sampling/importance_sampling_ratio/mean": 0.9820230007171631, | |
| "sampling/importance_sampling_ratio/min": 0.7251328825950623, | |
| "sampling/sampling_logp_difference/max": 0.38344359397888184, | |
| "sampling/sampling_logp_difference/mean": 0.011255129240453243, | |
| "step": 75, | |
| "step_time": 29.904527067999425 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0416666679084301, | |
| "clip_ratio/high_mean": 0.014322916977107525, | |
| "clip_ratio/low_mean": 0.03125000069849193, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04557291744276881, | |
| "entropy": 0.1766198892146349, | |
| "epoch": 0.00152, | |
| "grad_norm": 0.24856555461883545, | |
| "kl": 0.5580815225839615, | |
| "learning_rate": 9.999997037045364e-05, | |
| "loss": -0.0236, | |
| "step": 76, | |
| "step_time": 7.936869918999946 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 829.0, | |
| "completions/max_terminated_length": 829.0, | |
| "completions/mean_length": 704.921875, | |
| "completions/mean_terminated_length": 704.921875, | |
| "completions/min_length": 290.0, | |
| "completions/min_terminated_length": 290.0, | |
| "entropy": 0.1587599003687501, | |
| "epoch": 0.00154, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5212773680686951, | |
| "kl": 0.5196739248931408, | |
| "learning_rate": 9.999996887045807e-05, | |
| "loss": -0.0035, | |
| "num_tokens": 3886377.0, | |
| "reward": 4.441685199737549, | |
| "reward_std": 10.929279327392578, | |
| "rewards/rollout_reward_func/mean": 4.441685676574707, | |
| "rewards/rollout_reward_func/std": 12.737987518310547, | |
| "sampling/importance_sampling_ratio/max": 1.4177803993225098, | |
| "sampling/importance_sampling_ratio/mean": 0.9960745573043823, | |
| "sampling/importance_sampling_ratio/min": 0.6403241157531738, | |
| "sampling/sampling_logp_difference/max": 0.35891127586364746, | |
| "sampling/sampling_logp_difference/mean": 0.009403642266988754, | |
| "step": 77, | |
| "step_time": 29.23442492700042 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03787878900766373, | |
| "clip_ratio/high_mean": 0.009469697251915932, | |
| "clip_ratio/low_mean": 0.02260890230536461, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03207859944086522, | |
| "entropy": 0.14319165889173746, | |
| "epoch": 0.00156, | |
| "grad_norm": 0.21223606169223785, | |
| "kl": 0.6083459779620171, | |
| "learning_rate": 9.999996733342559e-05, | |
| "loss": -0.0046, | |
| "step": 78, | |
| "step_time": 9.08587798599865 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0052083334885537624, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 827.0, | |
| "completions/max_terminated_length": 827.0, | |
| "completions/mean_length": 699.296875, | |
| "completions/mean_terminated_length": 699.296875, | |
| "completions/min_length": 445.0, | |
| "completions/min_terminated_length": 445.0, | |
| "entropy": 0.13262670719996095, | |
| "epoch": 0.00158, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3686180114746094, | |
| "kl": 0.5813394356518984, | |
| "learning_rate": 9.99999657593562e-05, | |
| "loss": 0.0239, | |
| "num_tokens": 3983088.0, | |
| "reward": 4.565882682800293, | |
| "reward_std": 10.690776824951172, | |
| "rewards/rollout_reward_func/mean": 4.565882682800293, | |
| "rewards/rollout_reward_func/std": 10.94388484954834, | |
| "sampling/importance_sampling_ratio/max": 2.301131010055542, | |
| "sampling/importance_sampling_ratio/mean": 1.038649559020996, | |
| "sampling/importance_sampling_ratio/min": 0.6781718730926514, | |
| "sampling/sampling_logp_difference/max": 0.7350552082061768, | |
| "sampling/sampling_logp_difference/mean": 0.009047108702361584, | |
| "step": 79, | |
| "step_time": 29.03609049600027 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0691287899389863, | |
| "clip_ratio/high_mean": 0.02249053120613098, | |
| "clip_ratio/low_mean": 0.017282197484746575, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03977272880729288, | |
| "entropy": 0.1365647497586906, | |
| "epoch": 0.0016, | |
| "grad_norm": 0.26296547055244446, | |
| "kl": 0.5871373657137156, | |
| "learning_rate": 9.99999641482499e-05, | |
| "loss": 0.0196, | |
| "step": 80, | |
| "step_time": 8.78226529199901 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 817.0, | |
| "completions/max_terminated_length": 817.0, | |
| "completions/mean_length": 704.28125, | |
| "completions/mean_terminated_length": 704.28125, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "entropy": 0.13410852942615747, | |
| "epoch": 0.00162, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6391101479530334, | |
| "kl": 0.49469868279993534, | |
| "learning_rate": 9.999996250010672e-05, | |
| "loss": -0.0038, | |
| "num_tokens": 4080648.0, | |
| "reward": 5.768660545349121, | |
| "reward_std": 10.985546112060547, | |
| "rewards/rollout_reward_func/mean": 5.768660068511963, | |
| "rewards/rollout_reward_func/std": 11.962743759155273, | |
| "sampling/importance_sampling_ratio/max": 1.4244225025177002, | |
| "sampling/importance_sampling_ratio/mean": 1.0141850709915161, | |
| "sampling/importance_sampling_ratio/min": 0.6568657755851746, | |
| "sampling/sampling_logp_difference/max": 0.3986610174179077, | |
| "sampling/sampling_logp_difference/mean": 0.009017249569296837, | |
| "step": 81, | |
| "step_time": 29.058386802999394 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06250000186264515, | |
| "clip_ratio/high_mean": 0.016927083721384406, | |
| "clip_ratio/low_mean": 0.025213068933226168, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04214015288744122, | |
| "entropy": 0.1406740453094244, | |
| "epoch": 0.00164, | |
| "grad_norm": 0.3618878424167633, | |
| "kl": 0.5326054207980633, | |
| "learning_rate": 9.99999608149266e-05, | |
| "loss": -0.0092, | |
| "step": 82, | |
| "step_time": 7.4923771910011965 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 827.0, | |
| "completions/max_terminated_length": 827.0, | |
| "completions/mean_length": 690.765625, | |
| "completions/mean_terminated_length": 690.765625, | |
| "completions/min_length": 619.0, | |
| "completions/min_terminated_length": 619.0, | |
| "entropy": 0.13702308759093285, | |
| "epoch": 0.00166, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7054314017295837, | |
| "kl": 0.5327184200286865, | |
| "learning_rate": 9.999995909270962e-05, | |
| "loss": 0.0131, | |
| "num_tokens": 4176944.0, | |
| "reward": 6.398343563079834, | |
| "reward_std": 12.486600875854492, | |
| "rewards/rollout_reward_func/mean": 6.398343563079834, | |
| "rewards/rollout_reward_func/std": 13.118927955627441, | |
| "sampling/importance_sampling_ratio/max": 1.1626012325286865, | |
| "sampling/importance_sampling_ratio/mean": 0.9923787117004395, | |
| "sampling/importance_sampling_ratio/min": 0.6767197847366333, | |
| "sampling/sampling_logp_difference/max": 0.27681541442871094, | |
| "sampling/sampling_logp_difference/mean": 0.007814774289727211, | |
| "step": 83, | |
| "step_time": 30.334892443000626 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.052083334885537624, | |
| "clip_ratio/high_mean": 0.014322917093522847, | |
| "clip_ratio/low_mean": 0.02935606148093939, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04367897834163159, | |
| "entropy": 0.13890184368938208, | |
| "epoch": 0.00168, | |
| "grad_norm": 0.23295485973358154, | |
| "kl": 0.583111148327589, | |
| "learning_rate": 9.999995733345573e-05, | |
| "loss": 0.0096, | |
| "step": 84, | |
| "step_time": 8.188888645999668 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0014204545877873898, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0027225379599258304, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 847.0, | |
| "completions/max_terminated_length": 847.0, | |
| "completions/mean_length": 709.140625, | |
| "completions/mean_terminated_length": 709.140625, | |
| "completions/min_length": 385.0, | |
| "completions/min_terminated_length": 385.0, | |
| "entropy": 0.1646800385788083, | |
| "epoch": 0.0017, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.721032977104187, | |
| "kl": 0.5625268556177616, | |
| "learning_rate": 9.999995553716494e-05, | |
| "loss": -0.003, | |
| "num_tokens": 4273965.0, | |
| "reward": 5.8386735916137695, | |
| "reward_std": 13.300103187561035, | |
| "rewards/rollout_reward_func/mean": 5.8386735916137695, | |
| "rewards/rollout_reward_func/std": 13.629975318908691, | |
| "sampling/importance_sampling_ratio/max": 1.314743995666504, | |
| "sampling/importance_sampling_ratio/mean": 1.0051491260528564, | |
| "sampling/importance_sampling_ratio/min": 0.7047513127326965, | |
| "sampling/sampling_logp_difference/max": 0.2584061622619629, | |
| "sampling/sampling_logp_difference/mean": 0.009669218212366104, | |
| "step": 85, | |
| "step_time": 28.418585942000846 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06912878947332501, | |
| "clip_ratio/high_mean": 0.019886364112608135, | |
| "clip_ratio/low_mean": 0.04139046813361347, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.061276831780560315, | |
| "entropy": 0.16476231161504984, | |
| "epoch": 0.00172, | |
| "grad_norm": 0.3757534921169281, | |
| "kl": 0.6113345008343458, | |
| "learning_rate": 9.999995370383726e-05, | |
| "loss": -0.0069, | |
| "step": 86, | |
| "step_time": 8.756163650000417 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011363636702299118, | |
| "clip_ratio/high_mean": 0.0028409091755747795, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00414299254771322, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 823.0, | |
| "completions/max_terminated_length": 823.0, | |
| "completions/mean_length": 684.84375, | |
| "completions/mean_terminated_length": 684.84375, | |
| "completions/min_length": 619.0, | |
| "completions/min_terminated_length": 619.0, | |
| "entropy": 0.15467680245637894, | |
| "epoch": 0.00174, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.39151129126548767, | |
| "kl": 0.5371765866875648, | |
| "learning_rate": 9.999995183347267e-05, | |
| "loss": 0.0105, | |
| "num_tokens": 4369299.0, | |
| "reward": 5.963912010192871, | |
| "reward_std": 12.684013366699219, | |
| "rewards/rollout_reward_func/mean": 5.963912010192871, | |
| "rewards/rollout_reward_func/std": 13.017167091369629, | |
| "sampling/importance_sampling_ratio/max": 1.2570720911026, | |
| "sampling/importance_sampling_ratio/mean": 1.0000150203704834, | |
| "sampling/importance_sampling_ratio/min": 0.6576955914497375, | |
| "sampling/sampling_logp_difference/max": 0.23494529724121094, | |
| "sampling/sampling_logp_difference/mean": 0.009037286043167114, | |
| "step": 87, | |
| "step_time": 27.959497561998433 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04876894038170576, | |
| "clip_ratio/high_mean": 0.014796401956118643, | |
| "clip_ratio/low_mean": 0.030184660223312676, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04498106241226196, | |
| "entropy": 0.1502314694225788, | |
| "epoch": 0.00176, | |
| "grad_norm": 0.24433566629886627, | |
| "kl": 0.518398828804493, | |
| "learning_rate": 9.999994992607121e-05, | |
| "loss": 0.0052, | |
| "step": 88, | |
| "step_time": 6.98385682199978 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 808.0, | |
| "completions/max_terminated_length": 808.0, | |
| "completions/mean_length": 674.84375, | |
| "completions/mean_terminated_length": 674.84375, | |
| "completions/min_length": 286.0, | |
| "completions/min_terminated_length": 286.0, | |
| "entropy": 0.1636304627172649, | |
| "epoch": 0.00178, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4070056080818176, | |
| "kl": 0.44829913787543774, | |
| "learning_rate": 9.999994798163285e-05, | |
| "loss": 0.0028, | |
| "num_tokens": 4464636.0, | |
| "reward": 4.596271991729736, | |
| "reward_std": 12.002615928649902, | |
| "rewards/rollout_reward_func/mean": 4.5962724685668945, | |
| "rewards/rollout_reward_func/std": 12.03700065612793, | |
| "sampling/importance_sampling_ratio/max": 1.8310773372650146, | |
| "sampling/importance_sampling_ratio/mean": 1.0015285015106201, | |
| "sampling/importance_sampling_ratio/min": 0.6802361011505127, | |
| "sampling/sampling_logp_difference/max": 0.63387131690979, | |
| "sampling/sampling_logp_difference/mean": 0.01002519205212593, | |
| "step": 89, | |
| "step_time": 29.20652451100068 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.053503789473325014, | |
| "clip_ratio/high_mean": 0.014678030740469694, | |
| "clip_ratio/low_mean": 0.014914773171767592, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02959280402865261, | |
| "entropy": 0.16931697819381952, | |
| "epoch": 0.0018, | |
| "grad_norm": 0.22567316889762878, | |
| "kl": 0.45854073390364647, | |
| "learning_rate": 9.999994600015763e-05, | |
| "loss": -0.0044, | |
| "step": 90, | |
| "step_time": 7.806028198999684 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003906250116415322, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 818.0, | |
| "completions/max_terminated_length": 818.0, | |
| "completions/mean_length": 690.921875, | |
| "completions/mean_terminated_length": 690.921875, | |
| "completions/min_length": 573.0, | |
| "completions/min_terminated_length": 573.0, | |
| "entropy": 0.17555938381701708, | |
| "epoch": 0.00182, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6369052529335022, | |
| "kl": 0.5130380634218454, | |
| "learning_rate": 9.99999439816455e-05, | |
| "loss": 0.0097, | |
| "num_tokens": 4560466.0, | |
| "reward": 4.266380786895752, | |
| "reward_std": 8.932316780090332, | |
| "rewards/rollout_reward_func/mean": 4.266380786895752, | |
| "rewards/rollout_reward_func/std": 9.506205558776855, | |
| "sampling/importance_sampling_ratio/max": 1.4317197799682617, | |
| "sampling/importance_sampling_ratio/mean": 0.9800074100494385, | |
| "sampling/importance_sampling_ratio/min": 0.6640469431877136, | |
| "sampling/sampling_logp_difference/max": 0.39695852994918823, | |
| "sampling/sampling_logp_difference/mean": 0.011851027607917786, | |
| "step": 91, | |
| "step_time": 30.03264877599986 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07812500186264515, | |
| "clip_ratio/high_mean": 0.027343750349245965, | |
| "clip_ratio/low_mean": 0.02604166732635349, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05338541814126074, | |
| "entropy": 0.16226398199796677, | |
| "epoch": 0.00184, | |
| "grad_norm": 0.49021783471107483, | |
| "kl": 0.678026232868433, | |
| "learning_rate": 9.999994192609649e-05, | |
| "loss": 0.0008, | |
| "step": 92, | |
| "step_time": 9.04058756600034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0026041667442768812, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003906250116415322, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 827.0, | |
| "completions/max_terminated_length": 827.0, | |
| "completions/mean_length": 706.40625, | |
| "completions/mean_terminated_length": 706.40625, | |
| "completions/min_length": 297.0, | |
| "completions/min_terminated_length": 297.0, | |
| "entropy": 0.1653224742040038, | |
| "epoch": 0.00186, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5930750966072083, | |
| "kl": 0.533378497697413, | |
| "learning_rate": 9.999993983351059e-05, | |
| "loss": 0.0049, | |
| "num_tokens": 4657400.0, | |
| "reward": 4.687631607055664, | |
| "reward_std": 12.176762580871582, | |
| "rewards/rollout_reward_func/mean": 4.687631607055664, | |
| "rewards/rollout_reward_func/std": 13.946465492248535, | |
| "sampling/importance_sampling_ratio/max": 2.0640549659729004, | |
| "sampling/importance_sampling_ratio/mean": 1.0510772466659546, | |
| "sampling/importance_sampling_ratio/min": 0.6677830219268799, | |
| "sampling/sampling_logp_difference/max": 0.5909380912780762, | |
| "sampling/sampling_logp_difference/mean": 0.01191171444952488, | |
| "step": 93, | |
| "step_time": 28.226474250999672 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0416666679084301, | |
| "clip_ratio/high_mean": 0.011718750349245965, | |
| "clip_ratio/low_mean": 0.022135417442768812, | |
| "clip_ratio/low_min": 0.0052083334885537624, | |
| "clip_ratio/region_mean": 0.033854167675599456, | |
| "entropy": 0.15958264330402017, | |
| "epoch": 0.00188, | |
| "grad_norm": 0.35930758714675903, | |
| "kl": 0.7466034032404423, | |
| "learning_rate": 9.999993770388783e-05, | |
| "loss": 0.0032, | |
| "step": 94, | |
| "step_time": 8.234778083000037 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 824.0, | |
| "completions/max_terminated_length": 824.0, | |
| "completions/mean_length": 697.828125, | |
| "completions/mean_terminated_length": 697.828125, | |
| "completions/min_length": 623.0, | |
| "completions/min_terminated_length": 623.0, | |
| "entropy": 0.16395951714366674, | |
| "epoch": 0.0019, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.30130699276924133, | |
| "kl": 0.4953720346093178, | |
| "learning_rate": 9.99999355372282e-05, | |
| "loss": 0.0087, | |
| "num_tokens": 4753836.0, | |
| "reward": 4.204550743103027, | |
| "reward_std": 11.951547622680664, | |
| "rewards/rollout_reward_func/mean": 4.204550743103027, | |
| "rewards/rollout_reward_func/std": 13.192495346069336, | |
| "sampling/importance_sampling_ratio/max": 1.7262465953826904, | |
| "sampling/importance_sampling_ratio/mean": 1.0100435018539429, | |
| "sampling/importance_sampling_ratio/min": 0.6937407851219177, | |
| "sampling/sampling_logp_difference/max": 0.5034514665603638, | |
| "sampling/sampling_logp_difference/mean": 0.008402319625020027, | |
| "step": 95, | |
| "step_time": 29.812369647001105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03645833441987634, | |
| "clip_ratio/high_mean": 0.010416666977107525, | |
| "clip_ratio/low_mean": 0.009114583604969084, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01953125058207661, | |
| "entropy": 0.18544823909178376, | |
| "epoch": 0.00192, | |
| "grad_norm": 0.2023509442806244, | |
| "kl": 0.44245083443820477, | |
| "learning_rate": 9.999993333353168e-05, | |
| "loss": 0.0061, | |
| "step": 96, | |
| "step_time": 7.195571093998296 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 829.0, | |
| "completions/max_terminated_length": 829.0, | |
| "completions/mean_length": 687.921875, | |
| "completions/mean_terminated_length": 687.921875, | |
| "completions/min_length": 596.0, | |
| "completions/min_terminated_length": 596.0, | |
| "entropy": 0.20835321862250566, | |
| "epoch": 0.00194, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.35837119817733765, | |
| "kl": 0.4389466196298599, | |
| "learning_rate": 9.999993109279828e-05, | |
| "loss": 0.0044, | |
| "num_tokens": 4849131.0, | |
| "reward": 3.880918264389038, | |
| "reward_std": 8.090033531188965, | |
| "rewards/rollout_reward_func/mean": 3.880918264389038, | |
| "rewards/rollout_reward_func/std": 9.26294231414795, | |
| "sampling/importance_sampling_ratio/max": 1.2344332933425903, | |
| "sampling/importance_sampling_ratio/mean": 0.9644654989242554, | |
| "sampling/importance_sampling_ratio/min": 0.7370292544364929, | |
| "sampling/sampling_logp_difference/max": 0.29116082191467285, | |
| "sampling/sampling_logp_difference/mean": 0.009601429104804993, | |
| "step": 97, | |
| "step_time": 30.270599251999556 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.052083334885537624, | |
| "clip_ratio/high_mean": 0.016927083721384406, | |
| "clip_ratio/low_mean": 0.014559659757651389, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.031486743479035795, | |
| "entropy": 0.21354177221655846, | |
| "epoch": 0.00196, | |
| "grad_norm": 0.20351360738277435, | |
| "kl": 0.43841097690165043, | |
| "learning_rate": 9.999992881502804e-05, | |
| "loss": 0.0004, | |
| "step": 98, | |
| "step_time": 7.506363271999817 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 824.0, | |
| "completions/max_terminated_length": 824.0, | |
| "completions/mean_length": 706.140625, | |
| "completions/mean_terminated_length": 706.140625, | |
| "completions/min_length": 391.0, | |
| "completions/min_terminated_length": 391.0, | |
| "entropy": 0.21307788416743279, | |
| "epoch": 0.00198, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4080103039741516, | |
| "kl": 0.5334641952067614, | |
| "learning_rate": 9.99999265002209e-05, | |
| "loss": -0.003, | |
| "num_tokens": 4945915.0, | |
| "reward": 5.200403213500977, | |
| "reward_std": 14.344334602355957, | |
| "rewards/rollout_reward_func/mean": 5.200403213500977, | |
| "rewards/rollout_reward_func/std": 14.294367790222168, | |
| "sampling/importance_sampling_ratio/max": 1.2247880697250366, | |
| "sampling/importance_sampling_ratio/mean": 1.0129998922348022, | |
| "sampling/importance_sampling_ratio/min": 0.7771543860435486, | |
| "sampling/sampling_logp_difference/max": 0.23006606101989746, | |
| "sampling/sampling_logp_difference/mean": 0.00854739174246788, | |
| "step": 99, | |
| "step_time": 29.33993570499979 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625000465661287, | |
| "clip_ratio/high_mean": 0.006510416744276881, | |
| "clip_ratio/low_mean": 0.023555872030556202, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.030066288774833083, | |
| "entropy": 0.21015852224081755, | |
| "epoch": 0.002, | |
| "grad_norm": 0.2297798991203308, | |
| "kl": 0.5835338849574327, | |
| "learning_rate": 9.999992414837691e-05, | |
| "loss": -0.008, | |
| "step": 100, | |
| "step_time": 8.775622698999086 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 837.0, | |
| "completions/max_terminated_length": 837.0, | |
| "completions/mean_length": 711.671875, | |
| "completions/mean_terminated_length": 711.671875, | |
| "completions/min_length": 616.0, | |
| "completions/min_terminated_length": 616.0, | |
| "entropy": 0.2137407148256898, | |
| "epoch": 0.00202, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4429977834224701, | |
| "kl": 0.4639507979154587, | |
| "learning_rate": 9.999992175949606e-05, | |
| "loss": -0.0173, | |
| "num_tokens": 5042733.0, | |
| "reward": 3.351179838180542, | |
| "reward_std": 8.503268241882324, | |
| "rewards/rollout_reward_func/mean": 3.351179838180542, | |
| "rewards/rollout_reward_func/std": 8.948554039001465, | |
| "sampling/importance_sampling_ratio/max": 1.328324556350708, | |
| "sampling/importance_sampling_ratio/mean": 1.0001481771469116, | |
| "sampling/importance_sampling_ratio/min": 0.5792597532272339, | |
| "sampling/sampling_logp_difference/max": 0.4302701950073242, | |
| "sampling/sampling_logp_difference/mean": 0.008802896365523338, | |
| "step": 101, | |
| "step_time": 29.50366010900052 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0572916679084301, | |
| "clip_ratio/high_mean": 0.02083333407063037, | |
| "clip_ratio/low_mean": 0.021188447950407863, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04202178155537695, | |
| "entropy": 0.19818230718374252, | |
| "epoch": 0.00204, | |
| "grad_norm": 0.228831484913826, | |
| "kl": 0.523833503946662, | |
| "learning_rate": 9.999991933357836e-05, | |
| "loss": -0.0238, | |
| "step": 102, | |
| "step_time": 7.743058271999871 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 819.0, | |
| "completions/max_terminated_length": 819.0, | |
| "completions/mean_length": 679.5, | |
| "completions/mean_terminated_length": 679.5, | |
| "completions/min_length": 393.0, | |
| "completions/min_terminated_length": 393.0, | |
| "entropy": 0.16811883123591542, | |
| "epoch": 0.00206, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2717866897583008, | |
| "kl": 0.5116975158452988, | |
| "learning_rate": 9.999991687062378e-05, | |
| "loss": 0.0026, | |
| "num_tokens": 5137485.0, | |
| "reward": 3.233732223510742, | |
| "reward_std": 12.289377212524414, | |
| "rewards/rollout_reward_func/mean": 3.233732223510742, | |
| "rewards/rollout_reward_func/std": 14.167500495910645, | |
| "sampling/importance_sampling_ratio/max": 1.1452041864395142, | |
| "sampling/importance_sampling_ratio/mean": 0.9949536323547363, | |
| "sampling/importance_sampling_ratio/min": 0.8263934254646301, | |
| "sampling/sampling_logp_difference/max": 0.11179852485656738, | |
| "sampling/sampling_logp_difference/mean": 0.00560589786618948, | |
| "step": 103, | |
| "step_time": 28.410943980999036 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03645833441987634, | |
| "clip_ratio/high_mean": 0.009114583604969084, | |
| "clip_ratio/low_mean": 0.036576704937033355, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04569128877483308, | |
| "entropy": 0.1520394361577928, | |
| "epoch": 0.00208, | |
| "grad_norm": 0.1855272352695465, | |
| "kl": 0.548751313239336, | |
| "learning_rate": 9.999991437063234e-05, | |
| "loss": -0.0007, | |
| "step": 104, | |
| "step_time": 7.630572153999765 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 821.0, | |
| "completions/max_terminated_length": 821.0, | |
| "completions/mean_length": 689.109375, | |
| "completions/mean_terminated_length": 689.109375, | |
| "completions/min_length": 379.0, | |
| "completions/min_terminated_length": 379.0, | |
| "entropy": 0.15672127809375525, | |
| "epoch": 0.0021, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.45729750394821167, | |
| "kl": 0.6510039251297712, | |
| "learning_rate": 9.999991183360407e-05, | |
| "loss": -0.011, | |
| "num_tokens": 5232831.0, | |
| "reward": 4.220555305480957, | |
| "reward_std": 10.952154159545898, | |
| "rewards/rollout_reward_func/mean": 4.220555305480957, | |
| "rewards/rollout_reward_func/std": 11.161866188049316, | |
| "sampling/importance_sampling_ratio/max": 1.289732813835144, | |
| "sampling/importance_sampling_ratio/mean": 0.9952840209007263, | |
| "sampling/importance_sampling_ratio/min": 0.6639890074729919, | |
| "sampling/sampling_logp_difference/max": 0.4248615503311157, | |
| "sampling/sampling_logp_difference/mean": 0.009283961728215218, | |
| "step": 105, | |
| "step_time": 29.269957731999057 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02083333395421505, | |
| "clip_ratio/high_mean": 0.006510416860692203, | |
| "clip_ratio/low_mean": 0.015625000232830644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.022135417093522847, | |
| "entropy": 0.15183987142518163, | |
| "epoch": 0.00212, | |
| "grad_norm": 0.19675187766551971, | |
| "kl": 0.7388164456933737, | |
| "learning_rate": 9.999990925953892e-05, | |
| "loss": -0.0165, | |
| "step": 106, | |
| "step_time": 7.576425396001014 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 817.0, | |
| "completions/max_terminated_length": 817.0, | |
| "completions/mean_length": 695.0625, | |
| "completions/mean_terminated_length": 695.0625, | |
| "completions/min_length": 619.0, | |
| "completions/min_terminated_length": 619.0, | |
| "entropy": 0.14494483266025782, | |
| "epoch": 0.00214, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4538170397281647, | |
| "kl": 0.678084384649992, | |
| "learning_rate": 9.999990664843695e-05, | |
| "loss": 0.0147, | |
| "num_tokens": 5328578.0, | |
| "reward": 9.525361061096191, | |
| "reward_std": 13.358152389526367, | |
| "rewards/rollout_reward_func/mean": 9.525361061096191, | |
| "rewards/rollout_reward_func/std": 14.251992225646973, | |
| "sampling/importance_sampling_ratio/max": 1.1812809705734253, | |
| "sampling/importance_sampling_ratio/mean": 0.9926539659500122, | |
| "sampling/importance_sampling_ratio/min": 0.7029387950897217, | |
| "sampling/sampling_logp_difference/max": 0.35564422607421875, | |
| "sampling/sampling_logp_difference/mean": 0.007083391770720482, | |
| "step": 107, | |
| "step_time": 28.04022229299926 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.046875000931322575, | |
| "clip_ratio/high_mean": 0.015625000349245965, | |
| "clip_ratio/low_mean": 0.015861742896959186, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03148674312978983, | |
| "entropy": 0.1589709185063839, | |
| "epoch": 0.00216, | |
| "grad_norm": 0.22844459116458893, | |
| "kl": 0.6251159347593784, | |
| "learning_rate": 9.999990400029812e-05, | |
| "loss": 0.0106, | |
| "step": 108, | |
| "step_time": 8.196292393000022 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 832.0, | |
| "completions/max_terminated_length": 832.0, | |
| "completions/mean_length": 703.75, | |
| "completions/mean_terminated_length": 703.75, | |
| "completions/min_length": 522.0, | |
| "completions/min_terminated_length": 522.0, | |
| "entropy": 0.17329717054963112, | |
| "epoch": 0.00218, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3682776987552643, | |
| "kl": 0.6051198206841946, | |
| "learning_rate": 9.999990131512245e-05, | |
| "loss": 0.0061, | |
| "num_tokens": 5424927.0, | |
| "reward": 6.206368923187256, | |
| "reward_std": 10.578010559082031, | |
| "rewards/rollout_reward_func/mean": 6.206368923187256, | |
| "rewards/rollout_reward_func/std": 11.067666053771973, | |
| "sampling/importance_sampling_ratio/max": 1.4831089973449707, | |
| "sampling/importance_sampling_ratio/mean": 1.002763271331787, | |
| "sampling/importance_sampling_ratio/min": 0.7234499454498291, | |
| "sampling/sampling_logp_difference/max": 0.3583219051361084, | |
| "sampling/sampling_logp_difference/mean": 0.007746794261038303, | |
| "step": 109, | |
| "step_time": 27.809638274999543 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.026988637167960405, | |
| "clip_ratio/high_mean": 0.010653409641236067, | |
| "clip_ratio/low_mean": 0.014441288309171796, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025094697950407863, | |
| "entropy": 0.17460143100470304, | |
| "epoch": 0.0022, | |
| "grad_norm": 0.1794712245464325, | |
| "kl": 0.6243367586284876, | |
| "learning_rate": 9.999989859290995e-05, | |
| "loss": 0.0027, | |
| "step": 110, | |
| "step_time": 7.0755484739993335 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 820.0, | |
| "completions/max_terminated_length": 820.0, | |
| "completions/mean_length": 697.578125, | |
| "completions/mean_terminated_length": 697.578125, | |
| "completions/min_length": 285.0, | |
| "completions/min_terminated_length": 285.0, | |
| "entropy": 0.17424820829182863, | |
| "epoch": 0.00222, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.42331627011299133, | |
| "kl": 0.586581215262413, | |
| "learning_rate": 9.99998958336606e-05, | |
| "loss": -0.0044, | |
| "num_tokens": 5520852.0, | |
| "reward": 3.5279414653778076, | |
| "reward_std": 14.582866668701172, | |
| "rewards/rollout_reward_func/mean": 3.5279414653778076, | |
| "rewards/rollout_reward_func/std": 15.890913963317871, | |
| "sampling/importance_sampling_ratio/max": 1.2239214181900024, | |
| "sampling/importance_sampling_ratio/mean": 0.9994624853134155, | |
| "sampling/importance_sampling_ratio/min": 0.6852503418922424, | |
| "sampling/sampling_logp_difference/max": 0.31956130266189575, | |
| "sampling/sampling_logp_difference/mean": 0.006933148950338364, | |
| "step": 111, | |
| "step_time": 29.204085013999247 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02651515230536461, | |
| "clip_ratio/high_mean": 0.006628788076341152, | |
| "clip_ratio/low_mean": 0.018129006726667285, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02475779491942376, | |
| "entropy": 0.1639441135339439, | |
| "epoch": 0.00224, | |
| "grad_norm": 0.19418354332447052, | |
| "kl": 0.650929281488061, | |
| "learning_rate": 9.999989303737441e-05, | |
| "loss": -0.0109, | |
| "step": 112, | |
| "step_time": 7.643361527999332 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 813.0, | |
| "completions/max_terminated_length": 813.0, | |
| "completions/mean_length": 693.09375, | |
| "completions/mean_terminated_length": 693.09375, | |
| "completions/min_length": 630.0, | |
| "completions/min_terminated_length": 630.0, | |
| "entropy": 0.14537212159484625, | |
| "epoch": 0.00226, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4737316071987152, | |
| "kl": 0.669768800958991, | |
| "learning_rate": 9.99998902040514e-05, | |
| "loss": 0.0169, | |
| "num_tokens": 5616460.0, | |
| "reward": 3.8732333183288574, | |
| "reward_std": 9.794268608093262, | |
| "rewards/rollout_reward_func/mean": 3.8732333183288574, | |
| "rewards/rollout_reward_func/std": 10.40365982055664, | |
| "sampling/importance_sampling_ratio/max": 1.1869113445281982, | |
| "sampling/importance_sampling_ratio/mean": 0.9964576959609985, | |
| "sampling/importance_sampling_ratio/min": 0.5200645923614502, | |
| "sampling/sampling_logp_difference/max": 0.6150112152099609, | |
| "sampling/sampling_logp_difference/mean": 0.007128065451979637, | |
| "step": 113, | |
| "step_time": 27.900884353000038 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.042140152771025896, | |
| "clip_ratio/high_mean": 0.011837121681310236, | |
| "clip_ratio/low_mean": 0.006510416860692203, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01834753854200244, | |
| "entropy": 0.14897123211994767, | |
| "epoch": 0.00228, | |
| "grad_norm": 0.20660799741744995, | |
| "kl": 0.7189689762890339, | |
| "learning_rate": 9.999988733369157e-05, | |
| "loss": 0.0137, | |
| "step": 114, | |
| "step_time": 7.532232160000149 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 828.0, | |
| "completions/max_terminated_length": 828.0, | |
| "completions/mean_length": 689.0625, | |
| "completions/mean_terminated_length": 689.0625, | |
| "completions/min_length": 291.0, | |
| "completions/min_terminated_length": 291.0, | |
| "entropy": 0.16697307769209146, | |
| "epoch": 0.0023, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.37319666147232056, | |
| "kl": 0.6085000336170197, | |
| "learning_rate": 9.999988442629488e-05, | |
| "loss": -0.015, | |
| "num_tokens": 5711756.0, | |
| "reward": 3.845529079437256, | |
| "reward_std": 9.702705383300781, | |
| "rewards/rollout_reward_func/mean": 3.845529079437256, | |
| "rewards/rollout_reward_func/std": 9.905435562133789, | |
| "sampling/importance_sampling_ratio/max": 1.3216222524642944, | |
| "sampling/importance_sampling_ratio/mean": 1.0128694772720337, | |
| "sampling/importance_sampling_ratio/min": 0.7146333456039429, | |
| "sampling/sampling_logp_difference/max": 0.3742462396621704, | |
| "sampling/sampling_logp_difference/mean": 0.006911748554557562, | |
| "step": 115, | |
| "step_time": 29.116642522001257 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0416666679084301, | |
| "clip_ratio/high_mean": 0.011718750349245965, | |
| "clip_ratio/low_mean": 0.020951705169864, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.032670455519109964, | |
| "entropy": 0.17214004416018724, | |
| "epoch": 0.00232, | |
| "grad_norm": 0.19411630928516388, | |
| "kl": 0.6454576198011637, | |
| "learning_rate": 9.99998814818614e-05, | |
| "loss": -0.0191, | |
| "step": 116, | |
| "step_time": 7.846242159999747 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0013020833721384406, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 815.0, | |
| "completions/max_terminated_length": 815.0, | |
| "completions/mean_length": 674.78125, | |
| "completions/mean_terminated_length": 674.78125, | |
| "completions/min_length": 273.0, | |
| "completions/min_terminated_length": 273.0, | |
| "entropy": 0.16358821745961905, | |
| "epoch": 0.00234, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3246734142303467, | |
| "kl": 0.5800395030528307, | |
| "learning_rate": 9.999987850039107e-05, | |
| "loss": 0.0099, | |
| "num_tokens": 5806145.0, | |
| "reward": 1.2733659744262695, | |
| "reward_std": 12.069713592529297, | |
| "rewards/rollout_reward_func/mean": 1.2733662128448486, | |
| "rewards/rollout_reward_func/std": 12.829185485839844, | |
| "sampling/importance_sampling_ratio/max": 1.306739330291748, | |
| "sampling/importance_sampling_ratio/mean": 1.0012977123260498, | |
| "sampling/importance_sampling_ratio/min": 0.8135073781013489, | |
| "sampling/sampling_logp_difference/max": 0.19866454601287842, | |
| "sampling/sampling_logp_difference/mean": 0.006336529273539782, | |
| "step": 117, | |
| "step_time": 27.930649275999258 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02083333395421505, | |
| "clip_ratio/high_mean": 0.006510416860692203, | |
| "clip_ratio/low_mean": 0.013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01953125058207661, | |
| "entropy": 0.16348634008318186, | |
| "epoch": 0.00236, | |
| "grad_norm": 0.11877016723155975, | |
| "kl": 0.587722685188055, | |
| "learning_rate": 9.999987548188396e-05, | |
| "loss": 0.0055, | |
| "step": 118, | |
| "step_time": 7.173724952000157 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 834.0, | |
| "completions/max_terminated_length": 834.0, | |
| "completions/mean_length": 685.921875, | |
| "completions/mean_terminated_length": 685.921875, | |
| "completions/min_length": 307.0, | |
| "completions/min_terminated_length": 307.0, | |
| "entropy": 0.17666231095790863, | |
| "epoch": 0.00238, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2972959578037262, | |
| "kl": 0.5733677446842194, | |
| "learning_rate": 9.999987242634001e-05, | |
| "loss": 0.0156, | |
| "num_tokens": 5901319.0, | |
| "reward": 6.098433494567871, | |
| "reward_std": 11.96851921081543, | |
| "rewards/rollout_reward_func/mean": 6.098433494567871, | |
| "rewards/rollout_reward_func/std": 14.112695693969727, | |
| "sampling/importance_sampling_ratio/max": 1.2103277444839478, | |
| "sampling/importance_sampling_ratio/mean": 1.0073938369750977, | |
| "sampling/importance_sampling_ratio/min": 0.7692804932594299, | |
| "sampling/sampling_logp_difference/max": 0.13658356666564941, | |
| "sampling/sampling_logp_difference/mean": 0.0063937013037502766, | |
| "step": 119, | |
| "step_time": 28.342584406000242 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.009114583488553762, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.011718750232830644, | |
| "entropy": 0.16604932164773345, | |
| "epoch": 0.0024, | |
| "grad_norm": 0.23078079521656036, | |
| "kl": 0.5974587891250849, | |
| "learning_rate": 9.999986933375924e-05, | |
| "loss": 0.0105, | |
| "step": 120, | |
| "step_time": 7.440147934999914 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0014204545877873898, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014204545877873898, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 827.0, | |
| "completions/max_terminated_length": 827.0, | |
| "completions/mean_length": 677.953125, | |
| "completions/mean_terminated_length": 677.953125, | |
| "completions/min_length": 197.0, | |
| "completions/min_terminated_length": 197.0, | |
| "entropy": 0.13314053160138428, | |
| "epoch": 0.00242, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2688275873661041, | |
| "kl": 0.6813949979841709, | |
| "learning_rate": 9.999986620414167e-05, | |
| "loss": -0.0055, | |
| "num_tokens": 5995970.0, | |
| "reward": 4.1811299324035645, | |
| "reward_std": 11.76725959777832, | |
| "rewards/rollout_reward_func/mean": 4.1811299324035645, | |
| "rewards/rollout_reward_func/std": 12.213129997253418, | |
| "sampling/importance_sampling_ratio/max": 1.4055489301681519, | |
| "sampling/importance_sampling_ratio/mean": 1.0007095336914062, | |
| "sampling/importance_sampling_ratio/min": 0.7907775640487671, | |
| "sampling/sampling_logp_difference/max": 0.2328205108642578, | |
| "sampling/sampling_logp_difference/mean": 0.0057580312713980675, | |
| "step": 121, | |
| "step_time": 25.73195371799966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03172348579391837, | |
| "clip_ratio/high_mean": 0.007930871448479593, | |
| "clip_ratio/low_mean": 0.007812500232830644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015743371681310236, | |
| "entropy": 0.12837151251733303, | |
| "epoch": 0.00244, | |
| "grad_norm": 0.19652943313121796, | |
| "kl": 0.6755912862718105, | |
| "learning_rate": 9.99998630374873e-05, | |
| "loss": -0.0109, | |
| "step": 122, | |
| "step_time": 7.963045050999881 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 777.0, | |
| "completions/max_terminated_length": 777.0, | |
| "completions/mean_length": 676.75, | |
| "completions/mean_terminated_length": 676.75, | |
| "completions/min_length": 277.0, | |
| "completions/min_terminated_length": 277.0, | |
| "entropy": 0.14288373803719878, | |
| "epoch": 0.00246, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5129408836364746, | |
| "kl": 0.6468502469360828, | |
| "learning_rate": 9.999985983379613e-05, | |
| "loss": -0.002, | |
| "num_tokens": 6090409.0, | |
| "reward": 5.090976238250732, | |
| "reward_std": 8.817068099975586, | |
| "rewards/rollout_reward_func/mean": 5.090975761413574, | |
| "rewards/rollout_reward_func/std": 9.348170280456543, | |
| "sampling/importance_sampling_ratio/max": 1.2873598337173462, | |
| "sampling/importance_sampling_ratio/mean": 0.9989021420478821, | |
| "sampling/importance_sampling_ratio/min": 0.8453167676925659, | |
| "sampling/sampling_logp_difference/max": 0.1934504508972168, | |
| "sampling/sampling_logp_difference/mean": 0.0064778015948832035, | |
| "step": 123, | |
| "step_time": 28.174620942000274 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.026041667442768812, | |
| "clip_ratio/high_mean": 0.006510416860692203, | |
| "clip_ratio/low_mean": 0.02367424312978983, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03018465987406671, | |
| "entropy": 0.12851850083097816, | |
| "epoch": 0.00248, | |
| "grad_norm": 0.170461967587471, | |
| "kl": 0.6984463054686785, | |
| "learning_rate": 9.999985659306817e-05, | |
| "loss": -0.0077, | |
| "step": 124, | |
| "step_time": 6.415902794999965 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 823.0, | |
| "completions/max_terminated_length": 823.0, | |
| "completions/mean_length": 686.765625, | |
| "completions/mean_terminated_length": 686.765625, | |
| "completions/min_length": 274.0, | |
| "completions/min_terminated_length": 274.0, | |
| "entropy": 0.12949980096891522, | |
| "epoch": 0.0025, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4389054477214813, | |
| "kl": 0.8373041488230228, | |
| "learning_rate": 9.999985331530339e-05, | |
| "loss": -0.0001, | |
| "num_tokens": 6185533.0, | |
| "reward": 6.523627281188965, | |
| "reward_std": 12.731056213378906, | |
| "rewards/rollout_reward_func/mean": 6.523627281188965, | |
| "rewards/rollout_reward_func/std": 13.220861434936523, | |
| "sampling/importance_sampling_ratio/max": 1.4951905012130737, | |
| "sampling/importance_sampling_ratio/mean": 1.0012614727020264, | |
| "sampling/importance_sampling_ratio/min": 0.7251157760620117, | |
| "sampling/sampling_logp_difference/max": 0.39764922857284546, | |
| "sampling/sampling_logp_difference/mean": 0.006425045896321535, | |
| "step": 125, | |
| "step_time": 27.883570014999805 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03645833441987634, | |
| "clip_ratio/high_mean": 0.009114583604969084, | |
| "clip_ratio/low_mean": 0.02854567370377481, | |
| "clip_ratio/low_min": 0.0052083334885537624, | |
| "clip_ratio/region_mean": 0.03766025695949793, | |
| "entropy": 0.11564141698181629, | |
| "epoch": 0.00252, | |
| "grad_norm": 0.24558016657829285, | |
| "kl": 1.0033343844115734, | |
| "learning_rate": 9.999985000050182e-05, | |
| "loss": -0.0041, | |
| "step": 126, | |
| "step_time": 6.9546678629999406 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0013020833721384406, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0013020833721384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 837.0, | |
| "completions/max_terminated_length": 837.0, | |
| "completions/mean_length": 683.515625, | |
| "completions/mean_terminated_length": 683.515625, | |
| "completions/min_length": 617.0, | |
| "completions/min_terminated_length": 617.0, | |
| "entropy": 0.10271549178287387, | |
| "epoch": 0.00254, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.48136869072914124, | |
| "kl": 0.8534571155905724, | |
| "learning_rate": 9.999984664866347e-05, | |
| "loss": 0.0132, | |
| "num_tokens": 6280443.0, | |
| "reward": 4.674668788909912, | |
| "reward_std": 11.713541030883789, | |
| "rewards/rollout_reward_func/mean": 4.67466926574707, | |
| "rewards/rollout_reward_func/std": 13.705061912536621, | |
| "sampling/importance_sampling_ratio/max": 1.1515135765075684, | |
| "sampling/importance_sampling_ratio/mean": 0.9829530715942383, | |
| "sampling/importance_sampling_ratio/min": 0.6125902533531189, | |
| "sampling/sampling_logp_difference/max": 0.4248628616333008, | |
| "sampling/sampling_logp_difference/mean": 0.00649910606443882, | |
| "step": 127, | |
| "step_time": 27.126788691001366 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03645833441987634, | |
| "clip_ratio/high_mean": 0.009114583604969084, | |
| "clip_ratio/low_mean": 0.015625000465661287, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02473958395421505, | |
| "entropy": 0.10252567520365119, | |
| "epoch": 0.00256, | |
| "grad_norm": 0.25049537420272827, | |
| "kl": 0.9629664830863476, | |
| "learning_rate": 9.999984325978833e-05, | |
| "loss": 0.0108, | |
| "step": 128, | |
| "step_time": 7.304040701002123 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1019.0, | |
| "completions/max_terminated_length": 1019.0, | |
| "completions/mean_length": 959.40625, | |
| "completions/mean_terminated_length": 959.40625, | |
| "completions/min_length": 910.0, | |
| "completions/min_terminated_length": 910.0, | |
| "entropy": 0.13085902528837323, | |
| "epoch": 0.00258, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6536189913749695, | |
| "kl": 0.8716370463371277, | |
| "learning_rate": 9.99998398338764e-05, | |
| "loss": 0.0229, | |
| "num_tokens": 6393090.0, | |
| "reward": 5.6695556640625, | |
| "reward_std": 11.05074405670166, | |
| "rewards/rollout_reward_func/mean": 5.669555187225342, | |
| "rewards/rollout_reward_func/std": 12.366477966308594, | |
| "sampling/importance_sampling_ratio/max": 1.2895963191986084, | |
| "sampling/importance_sampling_ratio/mean": 1.023085355758667, | |
| "sampling/importance_sampling_ratio/min": 0.7725162506103516, | |
| "sampling/sampling_logp_difference/max": 0.30040407180786133, | |
| "sampling/sampling_logp_difference/mean": 0.008372966200113297, | |
| "step": 129, | |
| "step_time": 33.28244163999989 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07559524197131395, | |
| "clip_ratio/high_mean": 0.028273811331018806, | |
| "clip_ratio/low_mean": 0.02299107296857983, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.051264884416013956, | |
| "entropy": 0.14083249866962433, | |
| "epoch": 0.0026, | |
| "grad_norm": 0.30618196725845337, | |
| "kl": 0.9511819295585155, | |
| "learning_rate": 9.999983637092769e-05, | |
| "loss": 0.0154, | |
| "step": 130, | |
| "step_time": 8.346246693000012 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1034.0, | |
| "completions/max_terminated_length": 1034.0, | |
| "completions/mean_length": 954.09375, | |
| "completions/mean_terminated_length": 954.09375, | |
| "completions/min_length": 446.0, | |
| "completions/min_terminated_length": 446.0, | |
| "entropy": 0.1475105220451951, | |
| "epoch": 0.00262, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5288283228874207, | |
| "kl": 0.912773609161377, | |
| "learning_rate": 9.999983287094222e-05, | |
| "loss": -0.0212, | |
| "num_tokens": 6505385.0, | |
| "reward": 7.057158470153809, | |
| "reward_std": 10.295648574829102, | |
| "rewards/rollout_reward_func/mean": 7.057158470153809, | |
| "rewards/rollout_reward_func/std": 10.425559997558594, | |
| "sampling/importance_sampling_ratio/max": 1.3561307191848755, | |
| "sampling/importance_sampling_ratio/mean": 0.9847082495689392, | |
| "sampling/importance_sampling_ratio/min": 0.6632312536239624, | |
| "sampling/sampling_logp_difference/max": 0.22558808326721191, | |
| "sampling/sampling_logp_difference/mean": 0.00731184845790267, | |
| "step": 131, | |
| "step_time": 33.95278312799974 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06726190773770213, | |
| "clip_ratio/high_mean": 0.02313988225068897, | |
| "clip_ratio/low_mean": 0.03020833560731262, | |
| "clip_ratio/low_min": 0.004166666883975267, | |
| "clip_ratio/region_mean": 0.0533482184400782, | |
| "entropy": 0.16657310537993908, | |
| "epoch": 0.00264, | |
| "grad_norm": 0.29056552052497864, | |
| "kl": 0.7771002501249313, | |
| "learning_rate": 9.999982933391997e-05, | |
| "loss": -0.0284, | |
| "step": 132, | |
| "step_time": 7.300914149999244 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0031250001629814506, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1041.0, | |
| "completions/max_terminated_length": 1041.0, | |
| "completions/mean_length": 929.96875, | |
| "completions/mean_terminated_length": 929.96875, | |
| "completions/min_length": 190.0, | |
| "completions/min_terminated_length": 190.0, | |
| "entropy": 0.18474403023719788, | |
| "epoch": 0.00266, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6026266813278198, | |
| "kl": 0.7070890348404646, | |
| "learning_rate": 9.999982575986094e-05, | |
| "loss": -0.0, | |
| "num_tokens": 6616176.0, | |
| "reward": 3.4366226196289062, | |
| "reward_std": 14.906189918518066, | |
| "rewards/rollout_reward_func/mean": 3.436622381210327, | |
| "rewards/rollout_reward_func/std": 16.053083419799805, | |
| "sampling/importance_sampling_ratio/max": 1.3064157962799072, | |
| "sampling/importance_sampling_ratio/mean": 1.0054032802581787, | |
| "sampling/importance_sampling_ratio/min": 0.5862367749214172, | |
| "sampling/sampling_logp_difference/max": 0.5461184978485107, | |
| "sampling/sampling_logp_difference/mean": 0.010052897036075592, | |
| "step": 133, | |
| "step_time": 32.881761665999875 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06369047937914729, | |
| "clip_ratio/high_mean": 0.022172620403580368, | |
| "clip_ratio/low_mean": 0.03557477821595967, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05774739931803197, | |
| "entropy": 0.18857589829713106, | |
| "epoch": 0.00268, | |
| "grad_norm": 0.2586621344089508, | |
| "kl": 0.8268643505871296, | |
| "learning_rate": 9.999982214876515e-05, | |
| "loss": -0.0078, | |
| "step": 134, | |
| "step_time": 7.700307692000479 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1035.0, | |
| "completions/max_terminated_length": 1035.0, | |
| "completions/mean_length": 949.171875, | |
| "completions/mean_terminated_length": 949.171875, | |
| "completions/min_length": 820.0, | |
| "completions/min_terminated_length": 820.0, | |
| "entropy": 0.21391641069203615, | |
| "epoch": 0.0027, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5073988437652588, | |
| "kl": 0.669338870793581, | |
| "learning_rate": 9.999981850063262e-05, | |
| "loss": -0.0078, | |
| "num_tokens": 6728116.0, | |
| "reward": 5.17537784576416, | |
| "reward_std": 13.093953132629395, | |
| "rewards/rollout_reward_func/mean": 5.175378322601318, | |
| "rewards/rollout_reward_func/std": 13.309264183044434, | |
| "sampling/importance_sampling_ratio/max": 1.3000229597091675, | |
| "sampling/importance_sampling_ratio/mean": 0.9869031310081482, | |
| "sampling/importance_sampling_ratio/min": 0.7261144518852234, | |
| "sampling/sampling_logp_difference/max": 0.1514453887939453, | |
| "sampling/sampling_logp_difference/mean": 0.008218428120017052, | |
| "step": 135, | |
| "step_time": 32.26767973099959 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06815476482734084, | |
| "clip_ratio/high_mean": 0.022321430151350796, | |
| "clip_ratio/low_mean": 0.04136904957704246, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06369047961197793, | |
| "entropy": 0.21292453352361917, | |
| "epoch": 0.00272, | |
| "grad_norm": 0.3758169710636139, | |
| "kl": 0.6697604712098837, | |
| "learning_rate": 9.99998148154633e-05, | |
| "loss": -0.0147, | |
| "step": 136, | |
| "step_time": 8.984081079998305 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1033.0, | |
| "completions/max_terminated_length": 1033.0, | |
| "completions/mean_length": 941.34375, | |
| "completions/mean_terminated_length": 941.34375, | |
| "completions/min_length": 618.0, | |
| "completions/min_terminated_length": 618.0, | |
| "entropy": 0.23286819364875555, | |
| "epoch": 0.00274, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6691973805427551, | |
| "kl": 0.6865591164678335, | |
| "learning_rate": 9.999981109325724e-05, | |
| "loss": 0.0217, | |
| "num_tokens": 6839571.0, | |
| "reward": 4.762706756591797, | |
| "reward_std": 11.42410659790039, | |
| "rewards/rollout_reward_func/mean": 4.762706756591797, | |
| "rewards/rollout_reward_func/std": 11.434100151062012, | |
| "sampling/importance_sampling_ratio/max": 1.5375083684921265, | |
| "sampling/importance_sampling_ratio/mean": 1.0140312910079956, | |
| "sampling/importance_sampling_ratio/min": 0.691352128982544, | |
| "sampling/sampling_logp_difference/max": 0.24680709838867188, | |
| "sampling/sampling_logp_difference/mean": 0.010121582075953484, | |
| "step": 137, | |
| "step_time": 31.738008715999968 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07113095559179783, | |
| "clip_ratio/high_mean": 0.022098215762525797, | |
| "clip_ratio/low_mean": 0.0486922818236053, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.07079049723688513, | |
| "entropy": 0.2127716289833188, | |
| "epoch": 0.00276, | |
| "grad_norm": 0.30709579586982727, | |
| "kl": 0.6930392682552338, | |
| "learning_rate": 9.999980733401442e-05, | |
| "loss": 0.0087, | |
| "step": 138, | |
| "step_time": 8.016800426000827 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1036.0, | |
| "completions/max_terminated_length": 1036.0, | |
| "completions/mean_length": 978.6875, | |
| "completions/mean_terminated_length": 978.6875, | |
| "completions/min_length": 407.0, | |
| "completions/min_terminated_length": 407.0, | |
| "entropy": 0.19525799248367548, | |
| "epoch": 0.00278, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7576553225517273, | |
| "kl": 0.7210239768028259, | |
| "learning_rate": 9.999980353773486e-05, | |
| "loss": 0.0087, | |
| "num_tokens": 6953628.0, | |
| "reward": 7.9931640625, | |
| "reward_std": 14.572214126586914, | |
| "rewards/rollout_reward_func/mean": 7.993164539337158, | |
| "rewards/rollout_reward_func/std": 15.543896675109863, | |
| "sampling/importance_sampling_ratio/max": 1.4368523359298706, | |
| "sampling/importance_sampling_ratio/mean": 1.020465612411499, | |
| "sampling/importance_sampling_ratio/min": 0.6616964340209961, | |
| "sampling/sampling_logp_difference/max": 0.3545997142791748, | |
| "sampling/sampling_logp_difference/mean": 0.009700989350676537, | |
| "step": 139, | |
| "step_time": 31.6874715999993 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07712912419810891, | |
| "clip_ratio/high_mean": 0.02553228137549013, | |
| "clip_ratio/low_mean": 0.053521828493103385, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.07905411045067012, | |
| "entropy": 0.1893756091594696, | |
| "epoch": 0.0028, | |
| "grad_norm": 0.31711098551750183, | |
| "kl": 0.786970479413867, | |
| "learning_rate": 9.999979970441856e-05, | |
| "loss": -0.0032, | |
| "step": 140, | |
| "step_time": 8.092264081999474 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1037.0, | |
| "completions/max_terminated_length": 1037.0, | |
| "completions/mean_length": 955.375, | |
| "completions/mean_terminated_length": 955.375, | |
| "completions/min_length": 198.0, | |
| "completions/min_terminated_length": 198.0, | |
| "entropy": 0.17851338349282742, | |
| "epoch": 0.00282, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5795699954032898, | |
| "kl": 0.7661695275455713, | |
| "learning_rate": 9.999979583406551e-05, | |
| "loss": -0.0028, | |
| "num_tokens": 7066060.0, | |
| "reward": 5.970464706420898, | |
| "reward_std": 14.057101249694824, | |
| "rewards/rollout_reward_func/mean": 5.970464706420898, | |
| "rewards/rollout_reward_func/std": 15.589529991149902, | |
| "sampling/importance_sampling_ratio/max": 1.2306140661239624, | |
| "sampling/importance_sampling_ratio/mean": 1.0006752014160156, | |
| "sampling/importance_sampling_ratio/min": 0.7063568830490112, | |
| "sampling/sampling_logp_difference/max": 0.24321842193603516, | |
| "sampling/sampling_logp_difference/mean": 0.008507179096341133, | |
| "step": 141, | |
| "step_time": 31.519457149999653 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.10007440904155374, | |
| "clip_ratio/high_mean": 0.03335193661041558, | |
| "clip_ratio/low_mean": 0.04136905015911907, | |
| "clip_ratio/low_min": 0.004166666883975267, | |
| "clip_ratio/region_mean": 0.07472098711878061, | |
| "entropy": 0.16169621469452977, | |
| "epoch": 0.00284, | |
| "grad_norm": 0.21583755314350128, | |
| "kl": 0.8030649330466986, | |
| "learning_rate": 9.999979192667573e-05, | |
| "loss": -0.0127, | |
| "step": 142, | |
| "step_time": 8.37791394099986 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1048.0, | |
| "completions/max_terminated_length": 1048.0, | |
| "completions/mean_length": 965.875, | |
| "completions/mean_terminated_length": 965.875, | |
| "completions/min_length": 397.0, | |
| "completions/min_terminated_length": 397.0, | |
| "entropy": 0.1351936119608581, | |
| "epoch": 0.00286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6277215480804443, | |
| "kl": 0.6056302916258574, | |
| "learning_rate": 9.999978798224921e-05, | |
| "loss": -0.0037, | |
| "num_tokens": 7179154.0, | |
| "reward": 7.006319046020508, | |
| "reward_std": 16.71393394470215, | |
| "rewards/rollout_reward_func/mean": 7.006319522857666, | |
| "rewards/rollout_reward_func/std": 17.009944915771484, | |
| "sampling/importance_sampling_ratio/max": 1.4720120429992676, | |
| "sampling/importance_sampling_ratio/mean": 1.0313916206359863, | |
| "sampling/importance_sampling_ratio/min": 0.8535375595092773, | |
| "sampling/sampling_logp_difference/max": 0.33231019973754883, | |
| "sampling/sampling_logp_difference/mean": 0.007416378241032362, | |
| "step": 143, | |
| "step_time": 31.253298032000657 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03363095410168171, | |
| "clip_ratio/high_mean": 0.010565476841293275, | |
| "clip_ratio/low_mean": 0.022564054117538035, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03312953084241599, | |
| "entropy": 0.12869372498244047, | |
| "epoch": 0.00288, | |
| "grad_norm": 0.3357137143611908, | |
| "kl": 0.6441880892962217, | |
| "learning_rate": 9.999978400078598e-05, | |
| "loss": -0.011, | |
| "step": 144, | |
| "step_time": 8.612604698998894 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1033.0, | |
| "completions/max_terminated_length": 1033.0, | |
| "completions/mean_length": 981.234375, | |
| "completions/mean_terminated_length": 981.234375, | |
| "completions/min_length": 922.0, | |
| "completions/min_terminated_length": 922.0, | |
| "entropy": 0.14574182452633977, | |
| "epoch": 0.0029, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6256417632102966, | |
| "kl": 0.680026089772582, | |
| "learning_rate": 9.9999779982286e-05, | |
| "loss": 0.0066, | |
| "num_tokens": 7293276.0, | |
| "reward": 9.345416069030762, | |
| "reward_std": 12.761893272399902, | |
| "rewards/rollout_reward_func/mean": 9.345417022705078, | |
| "rewards/rollout_reward_func/std": 14.231216430664062, | |
| "sampling/importance_sampling_ratio/max": 1.2866383790969849, | |
| "sampling/importance_sampling_ratio/mean": 0.9946876764297485, | |
| "sampling/importance_sampling_ratio/min": 0.7063043117523193, | |
| "sampling/sampling_logp_difference/max": 0.3008323907852173, | |
| "sampling/sampling_logp_difference/mean": 0.007616790477186441, | |
| "step": 145, | |
| "step_time": 31.996783762999257 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05424107378348708, | |
| "clip_ratio/high_mean": 0.020851935259997845, | |
| "clip_ratio/low_mean": 0.03377976384945214, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05463169957511127, | |
| "entropy": 0.14952234365046024, | |
| "epoch": 0.00292, | |
| "grad_norm": 0.3231852948665619, | |
| "kl": 0.7531629204750061, | |
| "learning_rate": 9.999977592674931e-05, | |
| "loss": -0.0032, | |
| "step": 146, | |
| "step_time": 8.073437064001155 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 1024.0, | |
| "completions/mean_length": 951.1875, | |
| "completions/mean_terminated_length": 951.1875, | |
| "completions/min_length": 193.0, | |
| "completions/min_terminated_length": 193.0, | |
| "entropy": 0.14304543379694223, | |
| "epoch": 0.00294, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4544009566307068, | |
| "kl": 0.6561761032789946, | |
| "learning_rate": 9.999977183417592e-05, | |
| "loss": -0.0136, | |
| "num_tokens": 7405394.0, | |
| "reward": 9.592363357543945, | |
| "reward_std": 11.82339859008789, | |
| "rewards/rollout_reward_func/mean": 9.592363357543945, | |
| "rewards/rollout_reward_func/std": 12.213863372802734, | |
| "sampling/importance_sampling_ratio/max": 1.3994261026382446, | |
| "sampling/importance_sampling_ratio/mean": 0.9877851009368896, | |
| "sampling/importance_sampling_ratio/min": 0.5693183541297913, | |
| "sampling/sampling_logp_difference/max": 0.5401673913002014, | |
| "sampling/sampling_logp_difference/mean": 0.007635599002242088, | |
| "step": 147, | |
| "step_time": 31.870756492000055 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.054166669491678476, | |
| "clip_ratio/high_mean": 0.013541667372919619, | |
| "clip_ratio/low_mean": 0.036681550089269876, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05022321757860482, | |
| "entropy": 0.14903255039826035, | |
| "epoch": 0.00296, | |
| "grad_norm": 0.34076768159866333, | |
| "kl": 0.6760309524834156, | |
| "learning_rate": 9.99997677045658e-05, | |
| "loss": -0.0174, | |
| "step": 148, | |
| "step_time": 8.03263958799971 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0022435898426920176, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004326923284679651, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1022.0, | |
| "completions/max_terminated_length": 1022.0, | |
| "completions/mean_length": 950.578125, | |
| "completions/mean_terminated_length": 950.578125, | |
| "completions/min_length": 673.0, | |
| "completions/min_terminated_length": 673.0, | |
| "entropy": 0.16968106850981712, | |
| "epoch": 0.00298, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5638662576675415, | |
| "kl": 0.6232388503849506, | |
| "learning_rate": 9.999976353791898e-05, | |
| "loss": -0.0115, | |
| "num_tokens": 7517436.0, | |
| "reward": 6.506036281585693, | |
| "reward_std": 12.593399047851562, | |
| "rewards/rollout_reward_func/mean": 6.506035804748535, | |
| "rewards/rollout_reward_func/std": 13.552786827087402, | |
| "sampling/importance_sampling_ratio/max": 1.6476225852966309, | |
| "sampling/importance_sampling_ratio/mean": 0.9991188645362854, | |
| "sampling/importance_sampling_ratio/min": 0.5213066935539246, | |
| "sampling/sampling_logp_difference/max": 0.576519250869751, | |
| "sampling/sampling_logp_difference/mean": 0.01059242058545351, | |
| "step": 149, | |
| "step_time": 30.528242389000752 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05000000260770321, | |
| "clip_ratio/high_mean": 0.01458333432674408, | |
| "clip_ratio/low_mean": 0.03889938397333026, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05348271911498159, | |
| "entropy": 0.17780038248747587, | |
| "epoch": 0.003, | |
| "grad_norm": 0.5385463833808899, | |
| "kl": 0.8597960155457258, | |
| "learning_rate": 9.999975933423545e-05, | |
| "loss": -0.0172, | |
| "step": 150, | |
| "step_time": 8.0192518380004 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.002157738199457526, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003199404920451343, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1020.0, | |
| "completions/max_terminated_length": 1020.0, | |
| "completions/mean_length": 953.953125, | |
| "completions/mean_terminated_length": 953.953125, | |
| "completions/min_length": 664.0, | |
| "completions/min_terminated_length": 664.0, | |
| "entropy": 0.1825277367606759, | |
| "epoch": 0.00302, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6920294165611267, | |
| "kl": 0.6721424907445908, | |
| "learning_rate": 9.999975509351522e-05, | |
| "loss": -0.0165, | |
| "num_tokens": 7629697.0, | |
| "reward": 6.279596328735352, | |
| "reward_std": 13.454200744628906, | |
| "rewards/rollout_reward_func/mean": 6.279596328735352, | |
| "rewards/rollout_reward_func/std": 15.490900039672852, | |
| "sampling/importance_sampling_ratio/max": 1.2544176578521729, | |
| "sampling/importance_sampling_ratio/mean": 0.9968298673629761, | |
| "sampling/importance_sampling_ratio/min": 0.5891286730766296, | |
| "sampling/sampling_logp_difference/max": 0.36822509765625, | |
| "sampling/sampling_logp_difference/mean": 0.009644769132137299, | |
| "step": 151, | |
| "step_time": 30.041253716999563 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06250000279396772, | |
| "clip_ratio/high_mean": 0.02187500149011612, | |
| "clip_ratio/low_mean": 0.027847783756442368, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0497227858286351, | |
| "entropy": 0.19313342962414026, | |
| "epoch": 0.00304, | |
| "grad_norm": 0.3150973320007324, | |
| "kl": 0.6543413959443569, | |
| "learning_rate": 9.99997508157583e-05, | |
| "loss": -0.0263, | |
| "step": 152, | |
| "step_time": 8.048088266000377 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 993.0, | |
| "completions/max_terminated_length": 993.0, | |
| "completions/mean_length": 933.640625, | |
| "completions/mean_terminated_length": 933.640625, | |
| "completions/min_length": 191.0, | |
| "completions/min_terminated_length": 191.0, | |
| "entropy": 0.1851256461814046, | |
| "epoch": 0.00306, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6191554665565491, | |
| "kl": 0.5646015591919422, | |
| "learning_rate": 9.999974650096467e-05, | |
| "loss": -0.0157, | |
| "num_tokens": 7740640.0, | |
| "reward": 7.951285362243652, | |
| "reward_std": 13.322220802307129, | |
| "rewards/rollout_reward_func/mean": 7.951285362243652, | |
| "rewards/rollout_reward_func/std": 15.29836654663086, | |
| "sampling/importance_sampling_ratio/max": 1.1902070045471191, | |
| "sampling/importance_sampling_ratio/mean": 0.9911805987358093, | |
| "sampling/importance_sampling_ratio/min": 0.6955353617668152, | |
| "sampling/sampling_logp_difference/max": 0.37529921531677246, | |
| "sampling/sampling_logp_difference/mean": 0.007848689332604408, | |
| "step": 153, | |
| "step_time": 30.541750664000574 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04301470750942826, | |
| "clip_ratio/high_mean": 0.013878677156753838, | |
| "clip_ratio/low_mean": 0.039536832249723375, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.053415509522892535, | |
| "entropy": 0.16637779865413904, | |
| "epoch": 0.00308, | |
| "grad_norm": 0.3494158089160919, | |
| "kl": 0.6059492044150829, | |
| "learning_rate": 9.999974214913437e-05, | |
| "loss": -0.0231, | |
| "step": 154, | |
| "step_time": 8.139173758999277 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1041.0, | |
| "completions/max_terminated_length": 1041.0, | |
| "completions/mean_length": 972.640625, | |
| "completions/mean_terminated_length": 972.640625, | |
| "completions/min_length": 935.0, | |
| "completions/min_terminated_length": 935.0, | |
| "entropy": 0.1503364727832377, | |
| "epoch": 0.0031, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6759209036827087, | |
| "kl": 0.6219805851578712, | |
| "learning_rate": 9.999973776026739e-05, | |
| "loss": 0.0152, | |
| "num_tokens": 7854154.0, | |
| "reward": 5.902735710144043, | |
| "reward_std": 12.42209243774414, | |
| "rewards/rollout_reward_func/mean": 5.902735710144043, | |
| "rewards/rollout_reward_func/std": 12.867145538330078, | |
| "sampling/importance_sampling_ratio/max": 1.4259474277496338, | |
| "sampling/importance_sampling_ratio/mean": 1.0006431341171265, | |
| "sampling/importance_sampling_ratio/min": 0.6987265348434448, | |
| "sampling/sampling_logp_difference/max": 0.35797882080078125, | |
| "sampling/sampling_logp_difference/mean": 0.008803295902907848, | |
| "step": 155, | |
| "step_time": 31.54653142600091 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.054464288521558046, | |
| "clip_ratio/high_mean": 0.018824405618943274, | |
| "clip_ratio/low_mean": 0.0364583358168602, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05528274178504944, | |
| "entropy": 0.1241895561106503, | |
| "epoch": 0.00312, | |
| "grad_norm": 0.955508828163147, | |
| "kl": 0.9998617265373468, | |
| "learning_rate": 9.999973333436372e-05, | |
| "loss": 0.017, | |
| "step": 156, | |
| "step_time": 7.910055370999544 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0006127451197244227, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016544118407182395, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1040.0, | |
| "completions/max_terminated_length": 1040.0, | |
| "completions/mean_length": 972.546875, | |
| "completions/mean_terminated_length": 972.546875, | |
| "completions/min_length": 305.0, | |
| "completions/min_terminated_length": 305.0, | |
| "entropy": 0.11084589222446084, | |
| "epoch": 0.00314, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9052144885063171, | |
| "kl": 0.9162529278546572, | |
| "learning_rate": 9.999972887142338e-05, | |
| "loss": 0.0236, | |
| "num_tokens": 7967770.0, | |
| "reward": 10.1655855178833, | |
| "reward_std": 15.845230102539062, | |
| "rewards/rollout_reward_func/mean": 10.1655855178833, | |
| "rewards/rollout_reward_func/std": 17.717178344726562, | |
| "sampling/importance_sampling_ratio/max": 1.5550763607025146, | |
| "sampling/importance_sampling_ratio/mean": 1.0152667760849, | |
| "sampling/importance_sampling_ratio/min": 0.6825421452522278, | |
| "sampling/sampling_logp_difference/max": 0.38708627223968506, | |
| "sampling/sampling_logp_difference/mean": 0.006948791444301605, | |
| "step": 157, | |
| "step_time": 30.977979516999312 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.041964287869632244, | |
| "clip_ratio/high_mean": 0.013616072130389512, | |
| "clip_ratio/low_mean": 0.019929535686969757, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03354560805018991, | |
| "entropy": 0.11029910668730736, | |
| "epoch": 0.00316, | |
| "grad_norm": 0.3586527705192566, | |
| "kl": 0.996163547039032, | |
| "learning_rate": 9.999972437144637e-05, | |
| "loss": 0.018, | |
| "step": 158, | |
| "step_time": 8.73399685899949 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1026.0, | |
| "completions/max_terminated_length": 1026.0, | |
| "completions/mean_length": 954.4375, | |
| "completions/mean_terminated_length": 954.4375, | |
| "completions/min_length": 686.0, | |
| "completions/min_terminated_length": 686.0, | |
| "entropy": 0.14973071590065956, | |
| "epoch": 0.00318, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7992783784866333, | |
| "kl": 0.5131530929356813, | |
| "learning_rate": 9.999971983443269e-05, | |
| "loss": -0.0019, | |
| "num_tokens": 8080082.0, | |
| "reward": 5.8201141357421875, | |
| "reward_std": 11.146739959716797, | |
| "rewards/rollout_reward_func/mean": 5.8201141357421875, | |
| "rewards/rollout_reward_func/std": 11.795808792114258, | |
| "sampling/importance_sampling_ratio/max": 1.2158492803573608, | |
| "sampling/importance_sampling_ratio/mean": 0.9923404455184937, | |
| "sampling/importance_sampling_ratio/min": 0.623603343963623, | |
| "sampling/sampling_logp_difference/max": 0.24274826049804688, | |
| "sampling/sampling_logp_difference/mean": 0.007134515792131424, | |
| "step": 159, | |
| "step_time": 31.143712819999564 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06250000232830644, | |
| "clip_ratio/high_mean": 0.017708334140479565, | |
| "clip_ratio/low_mean": 0.028382036020047963, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.046090369927696884, | |
| "entropy": 0.15424074092879891, | |
| "epoch": 0.0032, | |
| "grad_norm": 0.4114607274532318, | |
| "kl": 0.5258241277188063, | |
| "learning_rate": 9.999971526038235e-05, | |
| "loss": -0.0105, | |
| "step": 160, | |
| "step_time": 7.376053459000104 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1066.0, | |
| "completions/max_terminated_length": 1066.0, | |
| "completions/mean_length": 964.390625, | |
| "completions/mean_terminated_length": 964.390625, | |
| "completions/min_length": 795.0, | |
| "completions/min_terminated_length": 795.0, | |
| "entropy": 0.14214739575982094, | |
| "epoch": 0.00322, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6593955159187317, | |
| "kl": 0.7137422636151314, | |
| "learning_rate": 9.999971064929537e-05, | |
| "loss": 0.0221, | |
| "num_tokens": 8193063.0, | |
| "reward": 7.681003093719482, | |
| "reward_std": 11.441247940063477, | |
| "rewards/rollout_reward_func/mean": 7.681002616882324, | |
| "rewards/rollout_reward_func/std": 13.56708812713623, | |
| "sampling/importance_sampling_ratio/max": 1.4164402484893799, | |
| "sampling/importance_sampling_ratio/mean": 1.0107839107513428, | |
| "sampling/importance_sampling_ratio/min": 0.6920035481452942, | |
| "sampling/sampling_logp_difference/max": 0.3535594940185547, | |
| "sampling/sampling_logp_difference/mean": 0.007559535559266806, | |
| "step": 161, | |
| "step_time": 32.16549203100021 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.045833335258066654, | |
| "clip_ratio/high_mean": 0.014657739084213972, | |
| "clip_ratio/low_mean": 0.033670345321297646, | |
| "clip_ratio/low_min": 0.004166666883975267, | |
| "clip_ratio/region_mean": 0.04832808405626565, | |
| "entropy": 0.1284659137018025, | |
| "epoch": 0.00324, | |
| "grad_norm": 0.44948309659957886, | |
| "kl": 0.8788620755076408, | |
| "learning_rate": 9.999970600117172e-05, | |
| "loss": 0.0155, | |
| "step": 162, | |
| "step_time": 8.349364119001166 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1028.0, | |
| "completions/max_terminated_length": 1028.0, | |
| "completions/mean_length": 958.1875, | |
| "completions/mean_terminated_length": 958.1875, | |
| "completions/min_length": 688.0, | |
| "completions/min_terminated_length": 688.0, | |
| "entropy": 0.1297779600135982, | |
| "epoch": 0.00326, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.45152774453163147, | |
| "kl": 0.6032252982258797, | |
| "learning_rate": 9.999970131601142e-05, | |
| "loss": -0.007, | |
| "num_tokens": 8305653.0, | |
| "reward": 9.560303688049316, | |
| "reward_std": 12.965145111083984, | |
| "rewards/rollout_reward_func/mean": 9.560302734375, | |
| "rewards/rollout_reward_func/std": 13.572053909301758, | |
| "sampling/importance_sampling_ratio/max": 1.3970085382461548, | |
| "sampling/importance_sampling_ratio/mean": 0.9942675828933716, | |
| "sampling/importance_sampling_ratio/min": 0.5912600755691528, | |
| "sampling/sampling_logp_difference/max": 0.43671131134033203, | |
| "sampling/sampling_logp_difference/mean": 0.006968793459236622, | |
| "step": 163, | |
| "step_time": 29.62484441499919 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04534313944168389, | |
| "clip_ratio/high_mean": 0.013419118302408606, | |
| "clip_ratio/low_mean": 0.028385418467223644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04180453671142459, | |
| "entropy": 0.12798475893214345, | |
| "epoch": 0.00328, | |
| "grad_norm": 0.37086573243141174, | |
| "kl": 0.5329502020031214, | |
| "learning_rate": 9.99996965938145e-05, | |
| "loss": -0.0114, | |
| "step": 164, | |
| "step_time": 9.19148286500058 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1068.0, | |
| "completions/max_terminated_length": 1068.0, | |
| "completions/mean_length": 958.28125, | |
| "completions/mean_terminated_length": 958.28125, | |
| "completions/min_length": 714.0, | |
| "completions/min_terminated_length": 714.0, | |
| "entropy": 0.14121837774291635, | |
| "epoch": 0.0033, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.445486456155777, | |
| "kl": 0.6868670284748077, | |
| "learning_rate": 9.999969183458092e-05, | |
| "loss": 0.017, | |
| "num_tokens": 8418180.0, | |
| "reward": 6.036255836486816, | |
| "reward_std": 14.006401062011719, | |
| "rewards/rollout_reward_func/mean": 6.036255836486816, | |
| "rewards/rollout_reward_func/std": 15.667006492614746, | |
| "sampling/importance_sampling_ratio/max": 1.4084051847457886, | |
| "sampling/importance_sampling_ratio/mean": 0.9844825267791748, | |
| "sampling/importance_sampling_ratio/min": 0.6458684802055359, | |
| "sampling/sampling_logp_difference/max": 0.35437726974487305, | |
| "sampling/sampling_logp_difference/mean": 0.008984029293060303, | |
| "step": 165, | |
| "step_time": 30.86212910500126 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.041964287869632244, | |
| "clip_ratio/high_mean": 0.012574405525811017, | |
| "clip_ratio/low_mean": 0.02604166802484542, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.038616073317825794, | |
| "entropy": 0.13694474566727877, | |
| "epoch": 0.00332, | |
| "grad_norm": 0.2597510814666748, | |
| "kl": 0.670884259045124, | |
| "learning_rate": 9.999968703831071e-05, | |
| "loss": 0.012, | |
| "step": 166, | |
| "step_time": 8.765868728999521 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1028.0, | |
| "completions/max_terminated_length": 1028.0, | |
| "completions/mean_length": 964.015625, | |
| "completions/mean_terminated_length": 964.015625, | |
| "completions/min_length": 773.0, | |
| "completions/min_terminated_length": 773.0, | |
| "entropy": 0.13714495720341802, | |
| "epoch": 0.00334, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.742760181427002, | |
| "kl": 0.5935596115887165, | |
| "learning_rate": 9.999968220500386e-05, | |
| "loss": 0.0264, | |
| "num_tokens": 8531148.0, | |
| "reward": 6.6519269943237305, | |
| "reward_std": 14.873868942260742, | |
| "rewards/rollout_reward_func/mean": 6.6519269943237305, | |
| "rewards/rollout_reward_func/std": 15.216424942016602, | |
| "sampling/importance_sampling_ratio/max": 1.4992643594741821, | |
| "sampling/importance_sampling_ratio/mean": 1.0216107368469238, | |
| "sampling/importance_sampling_ratio/min": 0.7036370635032654, | |
| "sampling/sampling_logp_difference/max": 0.351947546005249, | |
| "sampling/sampling_logp_difference/mean": 0.008944995701313019, | |
| "step": 167, | |
| "step_time": 30.057006109999747 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03750000195577741, | |
| "clip_ratio/high_mean": 0.013541667489334941, | |
| "clip_ratio/low_mean": 0.03437500225845724, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04791666998062283, | |
| "entropy": 0.13065697345882654, | |
| "epoch": 0.00336, | |
| "grad_norm": 8.381538391113281, | |
| "kl": 7.166379388421774, | |
| "learning_rate": 9.999967733466041e-05, | |
| "loss": 0.0808, | |
| "step": 168, | |
| "step_time": 8.213664751000124 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1016.0, | |
| "completions/max_terminated_length": 1016.0, | |
| "completions/mean_length": 943.078125, | |
| "completions/mean_terminated_length": 943.078125, | |
| "completions/min_length": 868.0, | |
| "completions/min_terminated_length": 868.0, | |
| "entropy": 0.13596792286261916, | |
| "epoch": 0.00338, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6320606470108032, | |
| "kl": 0.5089176166802645, | |
| "learning_rate": 9.999967242728034e-05, | |
| "loss": -0.0005, | |
| "num_tokens": 8642652.0, | |
| "reward": 9.83786392211914, | |
| "reward_std": 12.724628448486328, | |
| "rewards/rollout_reward_func/mean": 9.83786392211914, | |
| "rewards/rollout_reward_func/std": 13.589927673339844, | |
| "sampling/importance_sampling_ratio/max": 1.5156316757202148, | |
| "sampling/importance_sampling_ratio/mean": 1.001371145248413, | |
| "sampling/importance_sampling_ratio/min": 0.75341796875, | |
| "sampling/sampling_logp_difference/max": 0.40897202491760254, | |
| "sampling/sampling_logp_difference/mean": 0.006749385967850685, | |
| "step": 169, | |
| "step_time": 30.052868118000788 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020833334419876337, | |
| "clip_ratio/high_mean": 0.007291667046956718, | |
| "clip_ratio/low_mean": 0.03333333553746343, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04062500281725079, | |
| "entropy": 0.13328771898522973, | |
| "epoch": 0.0034, | |
| "grad_norm": 0.27786943316459656, | |
| "kl": 0.5417735707014799, | |
| "learning_rate": 9.999966748286363e-05, | |
| "loss": -0.004, | |
| "step": 170, | |
| "step_time": 7.808134698000686 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1033.0, | |
| "completions/max_terminated_length": 1033.0, | |
| "completions/mean_length": 977.46875, | |
| "completions/mean_terminated_length": 977.46875, | |
| "completions/min_length": 890.0, | |
| "completions/min_terminated_length": 890.0, | |
| "entropy": 0.14305478753522038, | |
| "epoch": 0.00342, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.477180153131485, | |
| "kl": 0.9006227869540453, | |
| "learning_rate": 9.999966250141033e-05, | |
| "loss": -0.016, | |
| "num_tokens": 8756508.0, | |
| "reward": 9.534229278564453, | |
| "reward_std": 10.647237777709961, | |
| "rewards/rollout_reward_func/mean": 9.534229278564453, | |
| "rewards/rollout_reward_func/std": 11.566615104675293, | |
| "sampling/importance_sampling_ratio/max": 1.4990143775939941, | |
| "sampling/importance_sampling_ratio/mean": 1.0070048570632935, | |
| "sampling/importance_sampling_ratio/min": 0.6254692077636719, | |
| "sampling/sampling_logp_difference/max": 0.4892125129699707, | |
| "sampling/sampling_logp_difference/mean": 0.008062894456088543, | |
| "step": 171, | |
| "step_time": 29.967204156000207 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03333333507180214, | |
| "clip_ratio/high_mean": 0.009375000605359674, | |
| "clip_ratio/low_mean": 0.03333333553746343, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.042708336492069066, | |
| "entropy": 0.13219841895624995, | |
| "epoch": 0.00344, | |
| "grad_norm": 0.2979583740234375, | |
| "kl": 0.9737532902508974, | |
| "learning_rate": 9.999965748292042e-05, | |
| "loss": -0.0247, | |
| "step": 172, | |
| "step_time": 8.450734508001005 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0031250001629814506, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1043.0, | |
| "completions/max_terminated_length": 1043.0, | |
| "completions/mean_length": 977.578125, | |
| "completions/mean_terminated_length": 977.578125, | |
| "completions/min_length": 911.0, | |
| "completions/min_terminated_length": 911.0, | |
| "entropy": 0.13215081067755818, | |
| "epoch": 0.00346, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.570915699005127, | |
| "kl": 0.7707110401242971, | |
| "learning_rate": 9.999965242739393e-05, | |
| "loss": 0.0115, | |
| "num_tokens": 8870395.0, | |
| "reward": 7.963113784790039, | |
| "reward_std": 12.185734748840332, | |
| "rewards/rollout_reward_func/mean": 7.963113784790039, | |
| "rewards/rollout_reward_func/std": 12.419037818908691, | |
| "sampling/importance_sampling_ratio/max": 1.2637660503387451, | |
| "sampling/importance_sampling_ratio/mean": 0.9871397614479065, | |
| "sampling/importance_sampling_ratio/min": 0.6115806102752686, | |
| "sampling/sampling_logp_difference/max": 0.3316690921783447, | |
| "sampling/sampling_logp_difference/mean": 0.0069004204124212265, | |
| "step": 173, | |
| "step_time": 29.865270385998883 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05000000214204192, | |
| "clip_ratio/high_mean": 0.013541667256504297, | |
| "clip_ratio/low_mean": 0.025976563920266926, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03951823094394058, | |
| "entropy": 0.1266618687659502, | |
| "epoch": 0.00348, | |
| "grad_norm": 0.3126421570777893, | |
| "kl": 0.7724483050405979, | |
| "learning_rate": 9.999964733483083e-05, | |
| "loss": 0.0074, | |
| "step": 174, | |
| "step_time": 8.14785716599863 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1040.0, | |
| "completions/max_terminated_length": 1040.0, | |
| "completions/mean_length": 986.78125, | |
| "completions/mean_terminated_length": 986.78125, | |
| "completions/min_length": 814.0, | |
| "completions/min_terminated_length": 814.0, | |
| "entropy": 0.12785040121525526, | |
| "epoch": 0.0035, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4320923089981079, | |
| "kl": 0.5314337890595198, | |
| "learning_rate": 9.999964220523112e-05, | |
| "loss": 0.0134, | |
| "num_tokens": 8984945.0, | |
| "reward": 11.988597869873047, | |
| "reward_std": 11.876688957214355, | |
| "rewards/rollout_reward_func/mean": 11.988597869873047, | |
| "rewards/rollout_reward_func/std": 12.529437065124512, | |
| "sampling/importance_sampling_ratio/max": 1.5641355514526367, | |
| "sampling/importance_sampling_ratio/mean": 1.0155951976776123, | |
| "sampling/importance_sampling_ratio/min": 0.7307262420654297, | |
| "sampling/sampling_logp_difference/max": 0.28014975786209106, | |
| "sampling/sampling_logp_difference/mean": 0.006405924912542105, | |
| "step": 175, | |
| "step_time": 30.800439373998415 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025000001303851604, | |
| "clip_ratio/high_mean": 0.008333333767950535, | |
| "clip_ratio/low_mean": 0.015625001047737896, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023958335164934397, | |
| "entropy": 0.12503943219780922, | |
| "epoch": 0.00352, | |
| "grad_norm": 0.25414347648620605, | |
| "kl": 0.545308168977499, | |
| "learning_rate": 9.999963703859485e-05, | |
| "loss": 0.0068, | |
| "step": 176, | |
| "step_time": 8.294947108000088 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1032.0, | |
| "completions/max_terminated_length": 1032.0, | |
| "completions/mean_length": 947.78125, | |
| "completions/mean_terminated_length": 947.78125, | |
| "completions/min_length": 877.0, | |
| "completions/min_terminated_length": 877.0, | |
| "entropy": 0.11795077985152602, | |
| "epoch": 0.00354, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5428488850593567, | |
| "kl": 0.5484364293515682, | |
| "learning_rate": 9.9999631834922e-05, | |
| "loss": 0.0209, | |
| "num_tokens": 9096764.0, | |
| "reward": 7.462541580200195, | |
| "reward_std": 9.003820419311523, | |
| "rewards/rollout_reward_func/mean": 7.462541103363037, | |
| "rewards/rollout_reward_func/std": 9.709749221801758, | |
| "sampling/importance_sampling_ratio/max": 1.6056361198425293, | |
| "sampling/importance_sampling_ratio/mean": 1.0011367797851562, | |
| "sampling/importance_sampling_ratio/min": 0.6226766109466553, | |
| "sampling/sampling_logp_difference/max": 0.48480892181396484, | |
| "sampling/sampling_logp_difference/mean": 0.007405002135783434, | |
| "step": 177, | |
| "step_time": 30.438013943000442 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025000001303851604, | |
| "clip_ratio/high_mean": 0.006250000325962901, | |
| "clip_ratio/low_mean": 0.021875001140870154, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.028125001583248377, | |
| "entropy": 0.11180919618345797, | |
| "epoch": 0.00356, | |
| "grad_norm": 1.0773159265518188, | |
| "kl": 0.7693799175322056, | |
| "learning_rate": 9.999962659421255e-05, | |
| "loss": 0.0218, | |
| "step": 178, | |
| "step_time": 8.289468396000302 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005208333604969084, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1017.0, | |
| "completions/max_terminated_length": 1017.0, | |
| "completions/mean_length": 962.765625, | |
| "completions/mean_terminated_length": 962.765625, | |
| "completions/min_length": 893.0, | |
| "completions/min_terminated_length": 893.0, | |
| "entropy": 0.12420041672885418, | |
| "epoch": 0.00358, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5452980399131775, | |
| "kl": 0.5841826293617487, | |
| "learning_rate": 9.999962131646658e-05, | |
| "loss": 0.0223, | |
| "num_tokens": 9209601.0, | |
| "reward": 9.949074745178223, | |
| "reward_std": 11.123800277709961, | |
| "rewards/rollout_reward_func/mean": 9.949074745178223, | |
| "rewards/rollout_reward_func/std": 11.492538452148438, | |
| "sampling/importance_sampling_ratio/max": 1.846232295036316, | |
| "sampling/importance_sampling_ratio/mean": 1.0060797929763794, | |
| "sampling/importance_sampling_ratio/min": 0.692804217338562, | |
| "sampling/sampling_logp_difference/max": 0.6036995649337769, | |
| "sampling/sampling_logp_difference/mean": 0.0071367728523910046, | |
| "step": 179, | |
| "step_time": 29.633916566999687 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03750000195577741, | |
| "clip_ratio/high_mean": 0.014583334093913436, | |
| "clip_ratio/low_mean": 0.018824405735358596, | |
| "clip_ratio/low_min": 0.004166666883975267, | |
| "clip_ratio/region_mean": 0.033407740062102675, | |
| "entropy": 0.11627750238403678, | |
| "epoch": 0.0036, | |
| "grad_norm": 0.38062411546707153, | |
| "kl": 0.639982882887125, | |
| "learning_rate": 9.999961600168402e-05, | |
| "loss": 0.0192, | |
| "step": 180, | |
| "step_time": 8.508149862998835 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1031.0, | |
| "completions/max_terminated_length": 1031.0, | |
| "completions/mean_length": 964.46875, | |
| "completions/mean_terminated_length": 964.46875, | |
| "completions/min_length": 816.0, | |
| "completions/min_terminated_length": 816.0, | |
| "entropy": 0.10024931281805038, | |
| "epoch": 0.00362, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7843199968338013, | |
| "kl": 0.5290133021771908, | |
| "learning_rate": 9.999961064986489e-05, | |
| "loss": -0.0105, | |
| "num_tokens": 9322591.0, | |
| "reward": 9.743326187133789, | |
| "reward_std": 11.718559265136719, | |
| "rewards/rollout_reward_func/mean": 9.743326187133789, | |
| "rewards/rollout_reward_func/std": 11.767054557800293, | |
| "sampling/importance_sampling_ratio/max": 1.2395166158676147, | |
| "sampling/importance_sampling_ratio/mean": 0.9893835783004761, | |
| "sampling/importance_sampling_ratio/min": 0.7077917456626892, | |
| "sampling/sampling_logp_difference/max": 0.36174678802490234, | |
| "sampling/sampling_logp_difference/mean": 0.0061057801358401775, | |
| "step": 181, | |
| "step_time": 30.010152957000173 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04583333572372794, | |
| "clip_ratio/high_mean": 0.01458333432674408, | |
| "clip_ratio/low_mean": 0.019791668048128486, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.034375002374872565, | |
| "entropy": 0.0898241214454174, | |
| "epoch": 0.00364, | |
| "grad_norm": 0.898304283618927, | |
| "kl": 1.3444663938134909, | |
| "learning_rate": 9.999960526100922e-05, | |
| "loss": -0.0074, | |
| "step": 182, | |
| "step_time": 8.117577253999116 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0011160714784637094, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00424107164144516, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1037.0, | |
| "completions/max_terminated_length": 1037.0, | |
| "completions/mean_length": 965.296875, | |
| "completions/mean_terminated_length": 965.296875, | |
| "completions/min_length": 887.0, | |
| "completions/min_terminated_length": 887.0, | |
| "entropy": 0.12110280524939299, | |
| "epoch": 0.00366, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.47400349378585815, | |
| "kl": 0.513819495216012, | |
| "learning_rate": 9.999959983511699e-05, | |
| "loss": 0.0011, | |
| "num_tokens": 9435640.0, | |
| "reward": 11.970619201660156, | |
| "reward_std": 16.7136287689209, | |
| "rewards/rollout_reward_func/mean": 11.970619201660156, | |
| "rewards/rollout_reward_func/std": 17.193565368652344, | |
| "sampling/importance_sampling_ratio/max": 1.4852927923202515, | |
| "sampling/importance_sampling_ratio/mean": 0.9956411123275757, | |
| "sampling/importance_sampling_ratio/min": 0.58425372838974, | |
| "sampling/sampling_logp_difference/max": 0.4939703941345215, | |
| "sampling/sampling_logp_difference/mean": 0.007358514238148928, | |
| "step": 183, | |
| "step_time": 30.018645907000064 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03750000195577741, | |
| "clip_ratio/high_mean": 0.01041666732635349, | |
| "clip_ratio/low_mean": 0.01875000086147338, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.029166668420657516, | |
| "entropy": 0.12478661234490573, | |
| "epoch": 0.00368, | |
| "grad_norm": 0.29323798418045044, | |
| "kl": 0.46843259409070015, | |
| "learning_rate": 9.999959437218822e-05, | |
| "loss": -0.0073, | |
| "step": 184, | |
| "step_time": 8.045792180003446 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1026.0, | |
| "completions/max_terminated_length": 1026.0, | |
| "completions/mean_length": 959.875, | |
| "completions/mean_terminated_length": 959.875, | |
| "completions/min_length": 676.0, | |
| "completions/min_terminated_length": 676.0, | |
| "entropy": 0.12372714094817638, | |
| "epoch": 0.0037, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.49018675088882446, | |
| "kl": 0.5567406937479973, | |
| "learning_rate": 9.999958887222293e-05, | |
| "loss": -0.0266, | |
| "num_tokens": 9548327.0, | |
| "reward": 8.300872802734375, | |
| "reward_std": 11.473505020141602, | |
| "rewards/rollout_reward_func/mean": 8.300872802734375, | |
| "rewards/rollout_reward_func/std": 13.137120246887207, | |
| "sampling/importance_sampling_ratio/max": 1.3434193134307861, | |
| "sampling/importance_sampling_ratio/mean": 1.0231890678405762, | |
| "sampling/importance_sampling_ratio/min": 0.8001201748847961, | |
| "sampling/sampling_logp_difference/max": 0.24235105514526367, | |
| "sampling/sampling_logp_difference/mean": 0.006944713182747364, | |
| "step": 185, | |
| "step_time": 30.03380806199948 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.058333336375653744, | |
| "clip_ratio/high_mean": 0.01770833437331021, | |
| "clip_ratio/low_mean": 0.012500000651925802, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03020833490882069, | |
| "entropy": 0.13152629090473056, | |
| "epoch": 0.00372, | |
| "grad_norm": 0.23521849513053894, | |
| "kl": 0.5634740013629198, | |
| "learning_rate": 9.999958333522109e-05, | |
| "loss": -0.0341, | |
| "step": 186, | |
| "step_time": 8.600791754000966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1045.0, | |
| "completions/max_terminated_length": 1045.0, | |
| "completions/mean_length": 945.21875, | |
| "completions/mean_terminated_length": 945.21875, | |
| "completions/min_length": 289.0, | |
| "completions/min_terminated_length": 289.0, | |
| "entropy": 0.1315653999336064, | |
| "epoch": 0.00374, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.36500951647758484, | |
| "kl": 0.5178995914757252, | |
| "learning_rate": 9.999957776118273e-05, | |
| "loss": -0.0136, | |
| "num_tokens": 9660136.0, | |
| "reward": 7.931632041931152, | |
| "reward_std": 11.40542984008789, | |
| "rewards/rollout_reward_func/mean": 7.931632041931152, | |
| "rewards/rollout_reward_func/std": 12.151664733886719, | |
| "sampling/importance_sampling_ratio/max": 1.7536835670471191, | |
| "sampling/importance_sampling_ratio/mean": 1.001771092414856, | |
| "sampling/importance_sampling_ratio/min": 0.7216951251029968, | |
| "sampling/sampling_logp_difference/max": 0.5699708461761475, | |
| "sampling/sampling_logp_difference/mean": 0.0067958529107272625, | |
| "step": 187, | |
| "step_time": 29.347854906000975 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.054166669491678476, | |
| "clip_ratio/high_mean": 0.01770833448972553, | |
| "clip_ratio/low_mean": 0.025694445823319256, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04340278054587543, | |
| "entropy": 0.13414463540539145, | |
| "epoch": 0.00376, | |
| "grad_norm": 0.21745486557483673, | |
| "kl": 0.5746774040162563, | |
| "learning_rate": 9.999957215010784e-05, | |
| "loss": -0.019, | |
| "step": 188, | |
| "step_time": 8.856123159000163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1026.0, | |
| "completions/max_terminated_length": 1026.0, | |
| "completions/mean_length": 947.34375, | |
| "completions/mean_terminated_length": 947.34375, | |
| "completions/min_length": 216.0, | |
| "completions/min_terminated_length": 216.0, | |
| "entropy": 0.14527452224865556, | |
| "epoch": 0.00378, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4758737087249756, | |
| "kl": 0.6551676895469427, | |
| "learning_rate": 9.999956650199645e-05, | |
| "loss": -0.0064, | |
| "num_tokens": 9771998.0, | |
| "reward": 8.513150215148926, | |
| "reward_std": 14.811095237731934, | |
| "rewards/rollout_reward_func/mean": 8.513150215148926, | |
| "rewards/rollout_reward_func/std": 15.769759178161621, | |
| "sampling/importance_sampling_ratio/max": 1.4140323400497437, | |
| "sampling/importance_sampling_ratio/mean": 1.0076611042022705, | |
| "sampling/importance_sampling_ratio/min": 0.5691302418708801, | |
| "sampling/sampling_logp_difference/max": 0.7131770253181458, | |
| "sampling/sampling_logp_difference/mean": 0.009376442059874535, | |
| "step": 189, | |
| "step_time": 30.213357230003567 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.054166669491678476, | |
| "clip_ratio/high_mean": 0.014583334210328758, | |
| "clip_ratio/low_mean": 0.0281250016996637, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.042708336375653744, | |
| "entropy": 0.13438974926248193, | |
| "epoch": 0.0038, | |
| "grad_norm": 0.2324807345867157, | |
| "kl": 0.737682543694973, | |
| "learning_rate": 9.999956081684854e-05, | |
| "loss": -0.0149, | |
| "step": 190, | |
| "step_time": 7.734431613998822 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1019.0, | |
| "completions/max_terminated_length": 1019.0, | |
| "completions/mean_length": 962.203125, | |
| "completions/mean_terminated_length": 962.203125, | |
| "completions/min_length": 881.0, | |
| "completions/min_terminated_length": 881.0, | |
| "entropy": 0.1253855088725686, | |
| "epoch": 0.00382, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.41439345479011536, | |
| "kl": 0.709712341427803, | |
| "learning_rate": 9.999955509466414e-05, | |
| "loss": 0.0269, | |
| "num_tokens": 9884808.0, | |
| "reward": 9.057092666625977, | |
| "reward_std": 9.098945617675781, | |
| "rewards/rollout_reward_func/mean": 9.05709171295166, | |
| "rewards/rollout_reward_func/std": 10.38012981414795, | |
| "sampling/importance_sampling_ratio/max": 1.3585758209228516, | |
| "sampling/importance_sampling_ratio/mean": 0.989570677280426, | |
| "sampling/importance_sampling_ratio/min": 0.6827925443649292, | |
| "sampling/sampling_logp_difference/max": 0.40184950828552246, | |
| "sampling/sampling_logp_difference/mean": 0.00655590184032917, | |
| "step": 191, | |
| "step_time": 31.590866651999022 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03392857313156128, | |
| "clip_ratio/high_mean": 0.010565476841293275, | |
| "clip_ratio/low_mean": 0.0293154779355973, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0398809548933059, | |
| "entropy": 0.11225170968100429, | |
| "epoch": 0.00384, | |
| "grad_norm": 0.23349761962890625, | |
| "kl": 0.8278532009571791, | |
| "learning_rate": 9.999954933544323e-05, | |
| "loss": 0.0201, | |
| "step": 192, | |
| "step_time": 7.970918687003177 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1031.0, | |
| "completions/max_terminated_length": 1031.0, | |
| "completions/mean_length": 977.734375, | |
| "completions/mean_terminated_length": 977.734375, | |
| "completions/min_length": 896.0, | |
| "completions/min_terminated_length": 896.0, | |
| "entropy": 0.11717891087755561, | |
| "epoch": 0.00386, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4010028839111328, | |
| "kl": 0.6346222888678312, | |
| "learning_rate": 9.999954353918583e-05, | |
| "loss": 0.0125, | |
| "num_tokens": 9998710.0, | |
| "reward": 12.752401351928711, | |
| "reward_std": 15.009429931640625, | |
| "rewards/rollout_reward_func/mean": 12.752399444580078, | |
| "rewards/rollout_reward_func/std": 15.288240432739258, | |
| "sampling/importance_sampling_ratio/max": 1.3140867948532104, | |
| "sampling/importance_sampling_ratio/mean": 0.9636229276657104, | |
| "sampling/importance_sampling_ratio/min": 0.5537927746772766, | |
| "sampling/sampling_logp_difference/max": 0.36048221588134766, | |
| "sampling/sampling_logp_difference/mean": 0.007171455770730972, | |
| "step": 193, | |
| "step_time": 30.459012025998163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029166667722165585, | |
| "clip_ratio/high_mean": 0.007291666930541396, | |
| "clip_ratio/low_mean": 0.03020833502523601, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03750000218860805, | |
| "entropy": 0.11490702140145004, | |
| "epoch": 0.00388, | |
| "grad_norm": 0.23535722494125366, | |
| "kl": 0.6073946505784988, | |
| "learning_rate": 9.999953770589194e-05, | |
| "loss": 0.006, | |
| "step": 194, | |
| "step_time": 8.631377130000146 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1033.0, | |
| "completions/max_terminated_length": 1033.0, | |
| "completions/mean_length": 970.203125, | |
| "completions/mean_terminated_length": 970.203125, | |
| "completions/min_length": 898.0, | |
| "completions/min_terminated_length": 898.0, | |
| "entropy": 0.11113500501960516, | |
| "epoch": 0.0039, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.604935348033905, | |
| "kl": 0.6790309809148312, | |
| "learning_rate": 9.999953183556157e-05, | |
| "loss": 0.0026, | |
| "num_tokens": 10112081.0, | |
| "reward": 7.972203731536865, | |
| "reward_std": 13.011554718017578, | |
| "rewards/rollout_reward_func/mean": 7.972204208374023, | |
| "rewards/rollout_reward_func/std": 13.773921966552734, | |
| "sampling/importance_sampling_ratio/max": 1.3542617559432983, | |
| "sampling/importance_sampling_ratio/mean": 0.9855128526687622, | |
| "sampling/importance_sampling_ratio/min": 0.597061276435852, | |
| "sampling/sampling_logp_difference/max": 0.4635782241821289, | |
| "sampling/sampling_logp_difference/mean": 0.006834958214312792, | |
| "step": 195, | |
| "step_time": 30.052685054003632 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029166668187826872, | |
| "clip_ratio/high_mean": 0.007291667046956718, | |
| "clip_ratio/low_mean": 0.015625000814907253, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.022916667978279293, | |
| "entropy": 0.11238743201829493, | |
| "epoch": 0.00392, | |
| "grad_norm": 0.4268299341201782, | |
| "kl": 0.700402544811368, | |
| "learning_rate": 9.999952592819473e-05, | |
| "loss": -0.0015, | |
| "step": 196, | |
| "step_time": 8.260044886000287 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0032738096779212356, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005357143119908869, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1043.0, | |
| "completions/max_terminated_length": 1043.0, | |
| "completions/mean_length": 934.5, | |
| "completions/mean_terminated_length": 934.5, | |
| "completions/min_length": 878.0, | |
| "completions/min_terminated_length": 878.0, | |
| "entropy": 0.1125073074363172, | |
| "epoch": 0.00394, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6487244963645935, | |
| "kl": 0.6208249572664499, | |
| "learning_rate": 9.99995199837914e-05, | |
| "loss": 0.0046, | |
| "num_tokens": 10223022.0, | |
| "reward": 8.661399841308594, | |
| "reward_std": 15.73376178741455, | |
| "rewards/rollout_reward_func/mean": 8.661399841308594, | |
| "rewards/rollout_reward_func/std": 15.457544326782227, | |
| "sampling/importance_sampling_ratio/max": 1.324127435684204, | |
| "sampling/importance_sampling_ratio/mean": 1.0008368492126465, | |
| "sampling/importance_sampling_ratio/min": 0.6733382344245911, | |
| "sampling/sampling_logp_difference/max": 0.35140299797058105, | |
| "sampling/sampling_logp_difference/mean": 0.007979365065693855, | |
| "step": 197, | |
| "step_time": 31.166683005998493 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021130953449755907, | |
| "clip_ratio/high_mean": 0.00840773864183575, | |
| "clip_ratio/low_mean": 0.02730654936749488, | |
| "clip_ratio/low_min": 0.004166666883975267, | |
| "clip_ratio/region_mean": 0.035714288242161274, | |
| "entropy": 0.11146878870204091, | |
| "epoch": 0.00396, | |
| "grad_norm": 0.5962705016136169, | |
| "kl": 0.9501709761098027, | |
| "learning_rate": 9.999951400235163e-05, | |
| "loss": 0.004, | |
| "step": 198, | |
| "step_time": 8.287430281997331 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012797619681805372, | |
| "clip_ratio/high_mean": 0.003199404920451343, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005282738362438977, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1047.0, | |
| "completions/max_terminated_length": 1047.0, | |
| "completions/mean_length": 980.65625, | |
| "completions/mean_terminated_length": 980.65625, | |
| "completions/min_length": 902.0, | |
| "completions/min_terminated_length": 902.0, | |
| "entropy": 0.11779335234314203, | |
| "epoch": 0.00398, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.523526668548584, | |
| "kl": 0.5669353120028973, | |
| "learning_rate": 9.999950798387541e-05, | |
| "loss": 0.0049, | |
| "num_tokens": 10337112.0, | |
| "reward": 10.420181274414062, | |
| "reward_std": 16.354602813720703, | |
| "rewards/rollout_reward_func/mean": 10.420181274414062, | |
| "rewards/rollout_reward_func/std": 17.055269241333008, | |
| "sampling/importance_sampling_ratio/max": 1.23856782913208, | |
| "sampling/importance_sampling_ratio/mean": 0.9714287519454956, | |
| "sampling/importance_sampling_ratio/min": 0.7061982750892639, | |
| "sampling/sampling_logp_difference/max": 0.447023868560791, | |
| "sampling/sampling_logp_difference/mean": 0.00747651606798172, | |
| "step": 199, | |
| "step_time": 30.34761462899951 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029464287217706442, | |
| "clip_ratio/high_mean": 0.010491072083823383, | |
| "clip_ratio/low_mean": 0.02091703994665295, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.031408111681230366, | |
| "entropy": 0.11395548144355416, | |
| "epoch": 0.004, | |
| "grad_norm": 0.3284382224082947, | |
| "kl": 0.5632808655500412, | |
| "learning_rate": 9.999950192836271e-05, | |
| "loss": -0.001, | |
| "step": 200, | |
| "step_time": 8.547375084998748 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1034.0, | |
| "completions/max_terminated_length": 1034.0, | |
| "completions/mean_length": 971.3125, | |
| "completions/mean_terminated_length": 971.3125, | |
| "completions/min_length": 873.0, | |
| "completions/min_terminated_length": 873.0, | |
| "entropy": 0.1107462802901864, | |
| "epoch": 0.00402, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.42466774582862854, | |
| "kl": 0.504519259557128, | |
| "learning_rate": 9.999949583581359e-05, | |
| "loss": 0.0037, | |
| "num_tokens": 10450565.0, | |
| "reward": 12.199589729309082, | |
| "reward_std": 12.77005672454834, | |
| "rewards/rollout_reward_func/mean": 12.199588775634766, | |
| "rewards/rollout_reward_func/std": 13.816198348999023, | |
| "sampling/importance_sampling_ratio/max": 1.1825975179672241, | |
| "sampling/importance_sampling_ratio/mean": 0.9908883571624756, | |
| "sampling/importance_sampling_ratio/min": 0.6934873461723328, | |
| "sampling/sampling_logp_difference/max": 0.3765444755554199, | |
| "sampling/sampling_logp_difference/mean": 0.006183322053402662, | |
| "step": 201, | |
| "step_time": 30.161383785001817 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03750000195577741, | |
| "clip_ratio/high_mean": 0.011458334047347307, | |
| "clip_ratio/low_mean": 0.021875001257285476, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03333333553746343, | |
| "entropy": 0.10510897357016802, | |
| "epoch": 0.00404, | |
| "grad_norm": 0.21419784426689148, | |
| "kl": 0.5648845955729485, | |
| "learning_rate": 9.999948970622802e-05, | |
| "loss": -0.0012, | |
| "step": 202, | |
| "step_time": 8.714965140998174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1046.0, | |
| "completions/max_terminated_length": 1046.0, | |
| "completions/mean_length": 979.578125, | |
| "completions/mean_terminated_length": 979.578125, | |
| "completions/min_length": 294.0, | |
| "completions/min_terminated_length": 294.0, | |
| "entropy": 0.12641333835199475, | |
| "epoch": 0.00406, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6306821703910828, | |
| "kl": 0.5104430429637432, | |
| "learning_rate": 9.9999483539606e-05, | |
| "loss": -0.0021, | |
| "num_tokens": 10564630.0, | |
| "reward": 10.778827667236328, | |
| "reward_std": 13.483461380004883, | |
| "rewards/rollout_reward_func/mean": 10.778827667236328, | |
| "rewards/rollout_reward_func/std": 14.313225746154785, | |
| "sampling/importance_sampling_ratio/max": 1.4068244695663452, | |
| "sampling/importance_sampling_ratio/mean": 0.9891500473022461, | |
| "sampling/importance_sampling_ratio/min": 0.6753217577934265, | |
| "sampling/sampling_logp_difference/max": 0.3969893455505371, | |
| "sampling/sampling_logp_difference/mean": 0.007549474947154522, | |
| "step": 203, | |
| "step_time": 29.916299866999907 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04583333572372794, | |
| "clip_ratio/high_mean": 0.013541667489334941, | |
| "clip_ratio/low_mean": 0.03132440650369972, | |
| "clip_ratio/low_min": 0.004166666883975267, | |
| "clip_ratio/region_mean": 0.04486607445869595, | |
| "entropy": 0.12076347460970283, | |
| "epoch": 0.00408, | |
| "grad_norm": 0.29815390706062317, | |
| "kl": 0.5736292470246553, | |
| "learning_rate": 9.999947733594757e-05, | |
| "loss": -0.0096, | |
| "step": 204, | |
| "step_time": 7.709945141001299 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.006250000325962901, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007291667046956718, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1016.0, | |
| "completions/max_terminated_length": 1016.0, | |
| "completions/mean_length": 948.3125, | |
| "completions/mean_terminated_length": 948.3125, | |
| "completions/min_length": 878.0, | |
| "completions/min_terminated_length": 878.0, | |
| "entropy": 0.10954847000539303, | |
| "epoch": 0.0041, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8904768228530884, | |
| "kl": 0.5102124018594623, | |
| "learning_rate": 9.999947109525271e-05, | |
| "loss": 0.0269, | |
| "num_tokens": 10676487.0, | |
| "reward": 7.509866237640381, | |
| "reward_std": 12.055532455444336, | |
| "rewards/rollout_reward_func/mean": 7.509865760803223, | |
| "rewards/rollout_reward_func/std": 12.425904273986816, | |
| "sampling/importance_sampling_ratio/max": 2.821709156036377, | |
| "sampling/importance_sampling_ratio/mean": 1.0446405410766602, | |
| "sampling/importance_sampling_ratio/min": 0.6838214993476868, | |
| "sampling/sampling_logp_difference/max": 0.6221010684967041, | |
| "sampling/sampling_logp_difference/mean": 0.007641012314707041, | |
| "step": 205, | |
| "step_time": 32.10779399100011 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029166668187826872, | |
| "clip_ratio/high_mean": 0.008333333767950535, | |
| "clip_ratio/low_mean": 0.0238932310603559, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.032226564711891115, | |
| "entropy": 0.09176747733727098, | |
| "epoch": 0.00412, | |
| "grad_norm": 0.5064001083374023, | |
| "kl": 0.6276722047477961, | |
| "learning_rate": 9.999946481752144e-05, | |
| "loss": 0.0257, | |
| "step": 206, | |
| "step_time": 8.04664100899663 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1032.0, | |
| "completions/max_terminated_length": 1032.0, | |
| "completions/mean_length": 948.5, | |
| "completions/mean_terminated_length": 948.5, | |
| "completions/min_length": 695.0, | |
| "completions/min_terminated_length": 695.0, | |
| "entropy": 0.0839753916952759, | |
| "epoch": 0.00414, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6526506543159485, | |
| "kl": 0.5436345608904958, | |
| "learning_rate": 9.999945850275377e-05, | |
| "loss": -0.0066, | |
| "num_tokens": 10788398.0, | |
| "reward": 4.734495639801025, | |
| "reward_std": 13.251731872558594, | |
| "rewards/rollout_reward_func/mean": 4.734495639801025, | |
| "rewards/rollout_reward_func/std": 15.050627708435059, | |
| "sampling/importance_sampling_ratio/max": 1.249489426612854, | |
| "sampling/importance_sampling_ratio/mean": 1.0017802715301514, | |
| "sampling/importance_sampling_ratio/min": 0.5872460603713989, | |
| "sampling/sampling_logp_difference/max": 0.5519323348999023, | |
| "sampling/sampling_logp_difference/mean": 0.007509762421250343, | |
| "step": 207, | |
| "step_time": 30.502280216000145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.041666668839752674, | |
| "clip_ratio/high_mean": 0.010416667209938169, | |
| "clip_ratio/low_mean": 0.020126489107497036, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.030543157132342458, | |
| "entropy": 0.0718412920832634, | |
| "epoch": 0.00416, | |
| "grad_norm": 0.9516690969467163, | |
| "kl": 1.0864872355014086, | |
| "learning_rate": 9.999945215094969e-05, | |
| "loss": -0.0086, | |
| "step": 208, | |
| "step_time": 8.340999965001174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1014.0, | |
| "completions/max_terminated_length": 1014.0, | |
| "completions/mean_length": 948.453125, | |
| "completions/mean_terminated_length": 948.453125, | |
| "completions/min_length": 656.0, | |
| "completions/min_terminated_length": 656.0, | |
| "entropy": 0.0782642443664372, | |
| "epoch": 0.00418, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9959932565689087, | |
| "kl": 0.5945225208997726, | |
| "learning_rate": 9.99994457621092e-05, | |
| "loss": 0.0171, | |
| "num_tokens": 10900280.0, | |
| "reward": 9.700709342956543, | |
| "reward_std": 13.213409423828125, | |
| "rewards/rollout_reward_func/mean": 9.700709342956543, | |
| "rewards/rollout_reward_func/std": 14.225313186645508, | |
| "sampling/importance_sampling_ratio/max": 1.3207358121871948, | |
| "sampling/importance_sampling_ratio/mean": 0.968299150466919, | |
| "sampling/importance_sampling_ratio/min": 0.3971961438655853, | |
| "sampling/sampling_logp_difference/max": 0.8888199329376221, | |
| "sampling/sampling_logp_difference/mean": 0.00841559562832117, | |
| "step": 209, | |
| "step_time": 29.9366263410011 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025000001303851604, | |
| "clip_ratio/high_mean": 0.006250000325962901, | |
| "clip_ratio/low_mean": 0.02656250144354999, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03281250165309757, | |
| "entropy": 0.07524953898973763, | |
| "epoch": 0.0042, | |
| "grad_norm": 0.22927281260490417, | |
| "kl": 0.5741278808563948, | |
| "learning_rate": 9.999943933623233e-05, | |
| "loss": 0.0142, | |
| "step": 210, | |
| "step_time": 8.610201505000987 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0011160714784637094, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00424107164144516, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1033.0, | |
| "completions/max_terminated_length": 1033.0, | |
| "completions/mean_length": 961.625, | |
| "completions/mean_terminated_length": 961.625, | |
| "completions/min_length": 183.0, | |
| "completions/min_terminated_length": 183.0, | |
| "entropy": 0.08438117569312453, | |
| "epoch": 0.00422, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6056447625160217, | |
| "kl": 0.4765475448220968, | |
| "learning_rate": 9.999943287331907e-05, | |
| "loss": -0.0396, | |
| "num_tokens": 11013133.0, | |
| "reward": 6.143215179443359, | |
| "reward_std": 9.006479263305664, | |
| "rewards/rollout_reward_func/mean": 6.143215179443359, | |
| "rewards/rollout_reward_func/std": 10.255783081054688, | |
| "sampling/importance_sampling_ratio/max": 1.5546733140945435, | |
| "sampling/importance_sampling_ratio/mean": 0.9941245913505554, | |
| "sampling/importance_sampling_ratio/min": 0.5497701168060303, | |
| "sampling/sampling_logp_difference/max": 0.6002916693687439, | |
| "sampling/sampling_logp_difference/mean": 0.0072316620498895645, | |
| "step": 211, | |
| "step_time": 29.858850818000974 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020833334419876337, | |
| "clip_ratio/high_mean": 0.006250000325962901, | |
| "clip_ratio/low_mean": 0.01889881060924381, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02514881093520671, | |
| "entropy": 0.08111793245188892, | |
| "epoch": 0.00424, | |
| "grad_norm": 0.3952238857746124, | |
| "kl": 0.5354121858254075, | |
| "learning_rate": 9.999942637336943e-05, | |
| "loss": -0.0419, | |
| "step": 212, | |
| "step_time": 8.145115041997997 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0031250001629814506, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1046.0, | |
| "completions/max_terminated_length": 1046.0, | |
| "completions/mean_length": 974.1875, | |
| "completions/mean_terminated_length": 974.1875, | |
| "completions/min_length": 853.0, | |
| "completions/min_terminated_length": 853.0, | |
| "entropy": 0.08407697454094887, | |
| "epoch": 0.00426, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5582248568534851, | |
| "kl": 0.5609004180878401, | |
| "learning_rate": 9.999941983638342e-05, | |
| "loss": -0.0096, | |
| "num_tokens": 11126805.0, | |
| "reward": 7.366800308227539, | |
| "reward_std": 11.575126647949219, | |
| "rewards/rollout_reward_func/mean": 7.366800785064697, | |
| "rewards/rollout_reward_func/std": 12.478679656982422, | |
| "sampling/importance_sampling_ratio/max": 1.7624305486679077, | |
| "sampling/importance_sampling_ratio/mean": 1.0073318481445312, | |
| "sampling/importance_sampling_ratio/min": 0.5805040001869202, | |
| "sampling/sampling_logp_difference/max": 0.5259637832641602, | |
| "sampling/sampling_logp_difference/mean": 0.007181447930634022, | |
| "step": 213, | |
| "step_time": 30.68919447299777 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.005208333721384406, | |
| "clip_ratio/low_mean": 0.02083333453629166, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.026041668257676065, | |
| "entropy": 0.08252408797852695, | |
| "epoch": 0.00428, | |
| "grad_norm": 0.4972332715988159, | |
| "kl": 0.8270881623029709, | |
| "learning_rate": 9.999941326236106e-05, | |
| "loss": -0.0102, | |
| "step": 214, | |
| "step_time": 8.636868058998516 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1021.0, | |
| "completions/max_terminated_length": 1021.0, | |
| "completions/mean_length": 947.234375, | |
| "completions/mean_terminated_length": 947.234375, | |
| "completions/min_length": 827.0, | |
| "completions/min_terminated_length": 827.0, | |
| "entropy": 0.09235736005939543, | |
| "epoch": 0.0043, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7149195075035095, | |
| "kl": 0.6700677536427975, | |
| "learning_rate": 9.999940665130233e-05, | |
| "loss": 0.0269, | |
| "num_tokens": 11238594.0, | |
| "reward": 8.236663818359375, | |
| "reward_std": 12.342934608459473, | |
| "rewards/rollout_reward_func/mean": 8.236662864685059, | |
| "rewards/rollout_reward_func/std": 13.346291542053223, | |
| "sampling/importance_sampling_ratio/max": 1.3215135335922241, | |
| "sampling/importance_sampling_ratio/mean": 1.0117213726043701, | |
| "sampling/importance_sampling_ratio/min": 0.607474684715271, | |
| "sampling/sampling_logp_difference/max": 0.3575429916381836, | |
| "sampling/sampling_logp_difference/mean": 0.00792029220610857, | |
| "step": 215, | |
| "step_time": 30.079993358000138 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03333333507180214, | |
| "clip_ratio/high_mean": 0.009375000605359674, | |
| "clip_ratio/low_mean": 0.03020833502523601, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.039583335630595684, | |
| "entropy": 0.09003956150263548, | |
| "epoch": 0.00432, | |
| "grad_norm": 0.22825075685977936, | |
| "kl": 0.7063372246921062, | |
| "learning_rate": 9.999940000320725e-05, | |
| "loss": 0.0204, | |
| "step": 216, | |
| "step_time": 8.830438550000508 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1038.0, | |
| "completions/max_terminated_length": 1038.0, | |
| "completions/mean_length": 973.421875, | |
| "completions/mean_terminated_length": 973.421875, | |
| "completions/min_length": 899.0, | |
| "completions/min_terminated_length": 899.0, | |
| "entropy": 0.08580271410755813, | |
| "epoch": 0.00434, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.544740617275238, | |
| "kl": 0.8732884284108877, | |
| "learning_rate": 9.999939331807582e-05, | |
| "loss": -0.0038, | |
| "num_tokens": 11352163.0, | |
| "reward": 6.939154148101807, | |
| "reward_std": 12.035371780395508, | |
| "rewards/rollout_reward_func/mean": 6.939153671264648, | |
| "rewards/rollout_reward_func/std": 12.4366455078125, | |
| "sampling/importance_sampling_ratio/max": 1.316995620727539, | |
| "sampling/importance_sampling_ratio/mean": 1.0068674087524414, | |
| "sampling/importance_sampling_ratio/min": 0.7823165059089661, | |
| "sampling/sampling_logp_difference/max": 0.2636311650276184, | |
| "sampling/sampling_logp_difference/mean": 0.0060178861021995544, | |
| "step": 217, | |
| "step_time": 30.360063980000632 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025000001303851604, | |
| "clip_ratio/high_mean": 0.006250000325962901, | |
| "clip_ratio/low_mean": 0.018750000977888703, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025000001420266926, | |
| "entropy": 0.08710528793744743, | |
| "epoch": 0.00436, | |
| "grad_norm": 0.38059887290000916, | |
| "kl": 0.908846540376544, | |
| "learning_rate": 9.999938659590807e-05, | |
| "loss": -0.0104, | |
| "step": 218, | |
| "step_time": 7.607007630000226 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0030598959419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006184896221384406, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1034.0, | |
| "completions/max_terminated_length": 1034.0, | |
| "completions/mean_length": 963.703125, | |
| "completions/mean_terminated_length": 963.703125, | |
| "completions/min_length": 890.0, | |
| "completions/min_terminated_length": 890.0, | |
| "entropy": 0.09868060098960996, | |
| "epoch": 0.00438, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.48504236340522766, | |
| "kl": 0.5343823749572039, | |
| "learning_rate": 9.999937983670398e-05, | |
| "loss": 0.0194, | |
| "num_tokens": 11465042.0, | |
| "reward": 6.008334636688232, | |
| "reward_std": 13.721019744873047, | |
| "rewards/rollout_reward_func/mean": 6.008334636688232, | |
| "rewards/rollout_reward_func/std": 14.5517578125, | |
| "sampling/importance_sampling_ratio/max": 1.471420407295227, | |
| "sampling/importance_sampling_ratio/mean": 0.9755445718765259, | |
| "sampling/importance_sampling_ratio/min": 0.5715925097465515, | |
| "sampling/sampling_logp_difference/max": 0.46605920791625977, | |
| "sampling/sampling_logp_difference/mean": 0.008881919085979462, | |
| "step": 219, | |
| "step_time": 31.71598935200018 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03750000195577741, | |
| "clip_ratio/high_mean": 0.010416667209938169, | |
| "clip_ratio/low_mean": 0.022851563524454832, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.033268230734393, | |
| "entropy": 0.09918246325105429, | |
| "epoch": 0.0044, | |
| "grad_norm": 0.22379587590694427, | |
| "kl": 0.5816311649978161, | |
| "learning_rate": 9.999937304046355e-05, | |
| "loss": 0.0147, | |
| "step": 220, | |
| "step_time": 8.254640479001864 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005208333604969084, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1030.0, | |
| "completions/max_terminated_length": 1030.0, | |
| "completions/mean_length": 955.53125, | |
| "completions/mean_terminated_length": 955.53125, | |
| "completions/min_length": 894.0, | |
| "completions/min_terminated_length": 894.0, | |
| "entropy": 0.09134439891204238, | |
| "epoch": 0.00442, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7109507322311401, | |
| "kl": 0.6976822856813669, | |
| "learning_rate": 9.999936620718681e-05, | |
| "loss": 0.0063, | |
| "num_tokens": 11577407.0, | |
| "reward": 7.297264099121094, | |
| "reward_std": 9.222230911254883, | |
| "rewards/rollout_reward_func/mean": 7.297264575958252, | |
| "rewards/rollout_reward_func/std": 10.19138240814209, | |
| "sampling/importance_sampling_ratio/max": 1.4536468982696533, | |
| "sampling/importance_sampling_ratio/mean": 0.9984610080718994, | |
| "sampling/importance_sampling_ratio/min": 0.7001582384109497, | |
| "sampling/sampling_logp_difference/max": 0.37113046646118164, | |
| "sampling/sampling_logp_difference/mean": 0.006036648992449045, | |
| "step": 221, | |
| "step_time": 29.85640163500102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03333333507180214, | |
| "clip_ratio/high_mean": 0.009375000488944352, | |
| "clip_ratio/low_mean": 0.014583334210328758, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02395833469927311, | |
| "entropy": 0.09828592231497169, | |
| "epoch": 0.00444, | |
| "grad_norm": 1.0275782346725464, | |
| "kl": 0.5333473347127438, | |
| "learning_rate": 9.999935933687375e-05, | |
| "loss": 0.0064, | |
| "step": 222, | |
| "step_time": 8.916792385998633 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 1024.0, | |
| "completions/mean_length": 955.5625, | |
| "completions/mean_terminated_length": 955.5625, | |
| "completions/min_length": 864.0, | |
| "completions/min_terminated_length": 864.0, | |
| "entropy": 0.11322583490982652, | |
| "epoch": 0.00446, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6080738306045532, | |
| "kl": 0.4430428724735975, | |
| "learning_rate": 9.999935242952441e-05, | |
| "loss": 0.0136, | |
| "num_tokens": 11689757.0, | |
| "reward": 7.010858535766602, | |
| "reward_std": 12.169811248779297, | |
| "rewards/rollout_reward_func/mean": 7.010858535766602, | |
| "rewards/rollout_reward_func/std": 12.808332443237305, | |
| "sampling/importance_sampling_ratio/max": 1.297809362411499, | |
| "sampling/importance_sampling_ratio/mean": 0.9824950695037842, | |
| "sampling/importance_sampling_ratio/min": 0.6718153953552246, | |
| "sampling/sampling_logp_difference/max": 0.3088874816894531, | |
| "sampling/sampling_logp_difference/mean": 0.007251087576150894, | |
| "step": 223, | |
| "step_time": 31.12233561499943 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05000000260770321, | |
| "clip_ratio/high_mean": 0.01770833437331021, | |
| "clip_ratio/low_mean": 0.02285156410653144, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04055989871267229, | |
| "entropy": 0.11228394089266658, | |
| "epoch": 0.00448, | |
| "grad_norm": 0.5497627258300781, | |
| "kl": 0.6058794800192118, | |
| "learning_rate": 9.999934548513874e-05, | |
| "loss": 0.0127, | |
| "step": 224, | |
| "step_time": 8.354907415001435 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0031250001629814506, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1039.0, | |
| "completions/max_terminated_length": 1039.0, | |
| "completions/mean_length": 941.9375, | |
| "completions/mean_terminated_length": 941.9375, | |
| "completions/min_length": 811.0, | |
| "completions/min_terminated_length": 811.0, | |
| "entropy": 0.10548029001802206, | |
| "epoch": 0.0045, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5894492864608765, | |
| "kl": 0.5073134936392307, | |
| "learning_rate": 9.999933850371681e-05, | |
| "loss": 0.0086, | |
| "num_tokens": 11801157.0, | |
| "reward": 6.051244258880615, | |
| "reward_std": 9.136930465698242, | |
| "rewards/rollout_reward_func/mean": 6.051244258880615, | |
| "rewards/rollout_reward_func/std": 9.732189178466797, | |
| "sampling/importance_sampling_ratio/max": 1.4082491397857666, | |
| "sampling/importance_sampling_ratio/mean": 0.9974700212478638, | |
| "sampling/importance_sampling_ratio/min": 0.6021063923835754, | |
| "sampling/sampling_logp_difference/max": 0.5747667551040649, | |
| "sampling/sampling_logp_difference/mean": 0.0068025123327970505, | |
| "step": 225, | |
| "step_time": 31.567075909998493 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025000001303851604, | |
| "clip_ratio/high_mean": 0.006250000325962901, | |
| "clip_ratio/low_mean": 0.027083334745839238, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.033333335421048105, | |
| "entropy": 0.10574616026133299, | |
| "epoch": 0.00452, | |
| "grad_norm": 0.2798631489276886, | |
| "kl": 0.7715174313634634, | |
| "learning_rate": 9.999933148525857e-05, | |
| "loss": 0.007, | |
| "step": 226, | |
| "step_time": 7.815335610000147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.0031250001629814506, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005208333604969084, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1017.0, | |
| "completions/max_terminated_length": 1017.0, | |
| "completions/mean_length": 959.5625, | |
| "completions/mean_terminated_length": 959.5625, | |
| "completions/min_length": 894.0, | |
| "completions/min_terminated_length": 894.0, | |
| "entropy": 0.11399172944948077, | |
| "epoch": 0.00454, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.568816602230072, | |
| "kl": 0.5976129788905382, | |
| "learning_rate": 9.999932442976408e-05, | |
| "loss": -0.0166, | |
| "num_tokens": 11913755.0, | |
| "reward": 8.459915161132812, | |
| "reward_std": 15.611612319946289, | |
| "rewards/rollout_reward_func/mean": 8.459915161132812, | |
| "rewards/rollout_reward_func/std": 15.882699012756348, | |
| "sampling/importance_sampling_ratio/max": 1.710551381111145, | |
| "sampling/importance_sampling_ratio/mean": 1.017797827720642, | |
| "sampling/importance_sampling_ratio/min": 0.7439659833908081, | |
| "sampling/sampling_logp_difference/max": 0.36053359508514404, | |
| "sampling/sampling_logp_difference/mean": 0.008504325523972511, | |
| "step": 227, | |
| "step_time": 31.015096133000043 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03750000195577741, | |
| "clip_ratio/high_mean": 0.013541667489334941, | |
| "clip_ratio/low_mean": 0.020833334769122303, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03437500272411853, | |
| "entropy": 0.11824841611087322, | |
| "epoch": 0.00456, | |
| "grad_norm": 0.31711068749427795, | |
| "kl": 0.6181838270276785, | |
| "learning_rate": 9.999931733723329e-05, | |
| "loss": -0.0224, | |
| "step": 228, | |
| "step_time": 8.539993245000005 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1029.0, | |
| "completions/max_terminated_length": 1029.0, | |
| "completions/mean_length": 949.875, | |
| "completions/mean_terminated_length": 949.875, | |
| "completions/min_length": 829.0, | |
| "completions/min_terminated_length": 829.0, | |
| "entropy": 0.11835443088784814, | |
| "epoch": 0.00458, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.510497510433197, | |
| "kl": 0.562442360445857, | |
| "learning_rate": 9.999931020766625e-05, | |
| "loss": -0.0151, | |
| "num_tokens": 12025731.0, | |
| "reward": 7.8150177001953125, | |
| "reward_std": 10.93730640411377, | |
| "rewards/rollout_reward_func/mean": 7.8150177001953125, | |
| "rewards/rollout_reward_func/std": 12.047165870666504, | |
| "sampling/importance_sampling_ratio/max": 1.8081343173980713, | |
| "sampling/importance_sampling_ratio/mean": 1.0230156183242798, | |
| "sampling/importance_sampling_ratio/min": 0.5872366428375244, | |
| "sampling/sampling_logp_difference/max": 0.5174302458763123, | |
| "sampling/sampling_logp_difference/mean": 0.008099589496850967, | |
| "step": 229, | |
| "step_time": 30.196818825002993 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029166668187826872, | |
| "clip_ratio/high_mean": 0.007291667046956718, | |
| "clip_ratio/low_mean": 0.028125001466833055, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03541666886303574, | |
| "entropy": 0.12235437287017703, | |
| "epoch": 0.0046, | |
| "grad_norm": 0.6880154013633728, | |
| "kl": 0.5698418729007244, | |
| "learning_rate": 9.999930304106295e-05, | |
| "loss": -0.0198, | |
| "step": 230, | |
| "step_time": 9.264137213997856 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01666666753590107, | |
| "clip_ratio/high_mean": 0.004166666883975267, | |
| "clip_ratio/low_mean": 0.0031250001629814506, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007291667046956718, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1022.0, | |
| "completions/max_terminated_length": 1022.0, | |
| "completions/mean_length": 953.078125, | |
| "completions/mean_terminated_length": 953.078125, | |
| "completions/min_length": 641.0, | |
| "completions/min_terminated_length": 641.0, | |
| "entropy": 0.10955408262088895, | |
| "epoch": 0.00462, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4517523944377899, | |
| "kl": 0.562993137165904, | |
| "learning_rate": 9.99992958374234e-05, | |
| "loss": 0.0172, | |
| "num_tokens": 12137928.0, | |
| "reward": 6.407680034637451, | |
| "reward_std": 12.907535552978516, | |
| "rewards/rollout_reward_func/mean": 6.407680034637451, | |
| "rewards/rollout_reward_func/std": 14.238213539123535, | |
| "sampling/importance_sampling_ratio/max": 1.3800395727157593, | |
| "sampling/importance_sampling_ratio/mean": 0.989479124546051, | |
| "sampling/importance_sampling_ratio/min": 0.5886368155479431, | |
| "sampling/sampling_logp_difference/max": 0.4858388900756836, | |
| "sampling/sampling_logp_difference/mean": 0.007202588953077793, | |
| "step": 231, | |
| "step_time": 30.985224181000376 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04583333572372794, | |
| "clip_ratio/high_mean": 0.012500000768341124, | |
| "clip_ratio/low_mean": 0.023177084513008595, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03567708551418036, | |
| "entropy": 0.09367383690550923, | |
| "epoch": 0.00464, | |
| "grad_norm": 0.3370003402233124, | |
| "kl": 0.5898754354566336, | |
| "learning_rate": 9.99992885967476e-05, | |
| "loss": 0.0141, | |
| "step": 232, | |
| "step_time": 7.848031210001864 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0031250001629814506, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005208333604969084, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1034.0, | |
| "completions/max_terminated_length": 1034.0, | |
| "completions/mean_length": 956.953125, | |
| "completions/mean_terminated_length": 956.953125, | |
| "completions/min_length": 643.0, | |
| "completions/min_terminated_length": 643.0, | |
| "entropy": 0.09109799051657319, | |
| "epoch": 0.00466, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9314411878585815, | |
| "kl": 0.6043513156473637, | |
| "learning_rate": 9.999928131903557e-05, | |
| "loss": -0.0214, | |
| "num_tokens": 12250409.0, | |
| "reward": 4.949494361877441, | |
| "reward_std": 13.414144515991211, | |
| "rewards/rollout_reward_func/mean": 4.949494361877441, | |
| "rewards/rollout_reward_func/std": 14.449867248535156, | |
| "sampling/importance_sampling_ratio/max": 1.7543931007385254, | |
| "sampling/importance_sampling_ratio/mean": 1.0081617832183838, | |
| "sampling/importance_sampling_ratio/min": 0.7344788908958435, | |
| "sampling/sampling_logp_difference/max": 0.40094685554504395, | |
| "sampling/sampling_logp_difference/mean": 0.007154828868806362, | |
| "step": 233, | |
| "step_time": 31.97192203099803 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06250000279396772, | |
| "clip_ratio/high_mean": 0.018750001094304025, | |
| "clip_ratio/low_mean": 0.026041668141260743, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04479166946839541, | |
| "entropy": 0.07302908715792, | |
| "epoch": 0.00468, | |
| "grad_norm": 0.6798368692398071, | |
| "kl": 1.063211616128683, | |
| "learning_rate": 9.999927400428733e-05, | |
| "loss": -0.0247, | |
| "step": 234, | |
| "step_time": 8.365730943999552 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1030.0, | |
| "completions/max_terminated_length": 1030.0, | |
| "completions/mean_length": 939.296875, | |
| "completions/mean_terminated_length": 939.296875, | |
| "completions/min_length": 455.0, | |
| "completions/min_terminated_length": 455.0, | |
| "entropy": 0.08065455732867122, | |
| "epoch": 0.0047, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6933859586715698, | |
| "kl": 0.5536066945642233, | |
| "learning_rate": 9.999926665250286e-05, | |
| "loss": -0.0262, | |
| "num_tokens": 12361673.0, | |
| "reward": 6.229083061218262, | |
| "reward_std": 13.382326126098633, | |
| "rewards/rollout_reward_func/mean": 6.229083061218262, | |
| "rewards/rollout_reward_func/std": 14.236706733703613, | |
| "sampling/importance_sampling_ratio/max": 1.6296361684799194, | |
| "sampling/importance_sampling_ratio/mean": 0.9904996752738953, | |
| "sampling/importance_sampling_ratio/min": 0.554724395275116, | |
| "sampling/sampling_logp_difference/max": 0.5841927528381348, | |
| "sampling/sampling_logp_difference/mean": 0.007334005553275347, | |
| "step": 235, | |
| "step_time": 30.556930122000267 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.041666668839752674, | |
| "clip_ratio/high_mean": 0.012500000768341124, | |
| "clip_ratio/low_mean": 0.023177084629423916, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03567708586342633, | |
| "entropy": 0.07718153693713248, | |
| "epoch": 0.00472, | |
| "grad_norm": 0.373847097158432, | |
| "kl": 0.7903371974825859, | |
| "learning_rate": 9.999925926368217e-05, | |
| "loss": -0.0281, | |
| "step": 236, | |
| "step_time": 8.405578448001506 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1029.0, | |
| "completions/max_terminated_length": 1029.0, | |
| "completions/mean_length": 969.71875, | |
| "completions/mean_terminated_length": 969.71875, | |
| "completions/min_length": 882.0, | |
| "completions/min_terminated_length": 882.0, | |
| "entropy": 0.09033584129065275, | |
| "epoch": 0.00474, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.672709584236145, | |
| "kl": 0.4807140491902828, | |
| "learning_rate": 9.999925183782528e-05, | |
| "loss": 0.0206, | |
| "num_tokens": 12475023.0, | |
| "reward": 7.968780517578125, | |
| "reward_std": 14.767425537109375, | |
| "rewards/rollout_reward_func/mean": 7.968780517578125, | |
| "rewards/rollout_reward_func/std": 15.451577186584473, | |
| "sampling/importance_sampling_ratio/max": 1.446393370628357, | |
| "sampling/importance_sampling_ratio/mean": 1.0078678131103516, | |
| "sampling/importance_sampling_ratio/min": 0.7536318898200989, | |
| "sampling/sampling_logp_difference/max": 0.36260342597961426, | |
| "sampling/sampling_logp_difference/mean": 0.006248952820897102, | |
| "step": 237, | |
| "step_time": 30.60275455199826 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.054166669491678476, | |
| "clip_ratio/high_mean": 0.01770833448972553, | |
| "clip_ratio/low_mean": 0.010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.028125002165324986, | |
| "entropy": 0.10236532241106033, | |
| "epoch": 0.00476, | |
| "grad_norm": 0.1718801110982895, | |
| "kl": 0.45644159242510796, | |
| "learning_rate": 9.999924437493219e-05, | |
| "loss": 0.0137, | |
| "step": 238, | |
| "step_time": 8.174696675001542 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004166666883975267, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1038.0, | |
| "completions/max_terminated_length": 1038.0, | |
| "completions/mean_length": 986.125, | |
| "completions/mean_terminated_length": 986.125, | |
| "completions/min_length": 910.0, | |
| "completions/min_terminated_length": 910.0, | |
| "entropy": 0.11793840350583196, | |
| "epoch": 0.00478, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.403202623128891, | |
| "kl": 0.5491157062351704, | |
| "learning_rate": 9.99992368750029e-05, | |
| "loss": 0.0205, | |
| "num_tokens": 12589470.0, | |
| "reward": 7.7764387130737305, | |
| "reward_std": 10.855308532714844, | |
| "rewards/rollout_reward_func/mean": 7.776438236236572, | |
| "rewards/rollout_reward_func/std": 11.845745086669922, | |
| "sampling/importance_sampling_ratio/max": 1.4125927686691284, | |
| "sampling/importance_sampling_ratio/mean": 1.0074553489685059, | |
| "sampling/importance_sampling_ratio/min": 0.6787428855895996, | |
| "sampling/sampling_logp_difference/max": 0.36117464303970337, | |
| "sampling/sampling_logp_difference/mean": 0.007153394166380167, | |
| "step": 239, | |
| "step_time": 31.446214477003195 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06614583590999246, | |
| "clip_ratio/high_mean": 0.02070312586147338, | |
| "clip_ratio/low_mean": 0.018750000977888703, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03945312695577741, | |
| "entropy": 0.12702699471265078, | |
| "epoch": 0.0048, | |
| "grad_norm": 0.28333526849746704, | |
| "kl": 0.5486433319747448, | |
| "learning_rate": 9.999922933803743e-05, | |
| "loss": 0.0157, | |
| "step": 240, | |
| "step_time": 8.18167577299937 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012500000651925802, | |
| "clip_ratio/high_mean": 0.004101562546566129, | |
| "clip_ratio/low_mean": 0.0015997024602256715, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0057012650067918, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1028.0, | |
| "completions/max_terminated_length": 1028.0, | |
| "completions/mean_length": 947.078125, | |
| "completions/mean_terminated_length": 947.078125, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "entropy": 0.15819989750161767, | |
| "epoch": 0.00482, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5497854948043823, | |
| "kl": 0.6653936766088009, | |
| "learning_rate": 9.999922176403578e-05, | |
| "loss": 0.0274, | |
| "num_tokens": 12701387.0, | |
| "reward": 4.367884635925293, | |
| "reward_std": 14.958491325378418, | |
| "rewards/rollout_reward_func/mean": 4.367884635925293, | |
| "rewards/rollout_reward_func/std": 15.79384708404541, | |
| "sampling/importance_sampling_ratio/max": 1.7386236190795898, | |
| "sampling/importance_sampling_ratio/mean": 1.0037915706634521, | |
| "sampling/importance_sampling_ratio/min": 1.5100153958014693e-17, | |
| "sampling/sampling_logp_difference/max": 32.36700439453125, | |
| "sampling/sampling_logp_difference/mean": 0.050702136009931564, | |
| "step": 241, | |
| "step_time": 30.312135679001585 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0713541698642075, | |
| "clip_ratio/high_mean": 0.02304687607102096, | |
| "clip_ratio/low_mean": 0.019182722782716155, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.042229598737321794, | |
| "entropy": 0.1654459210112691, | |
| "epoch": 0.00484, | |
| "grad_norm": 0.2647717595100403, | |
| "kl": 0.6706695519387722, | |
| "learning_rate": 9.999921415299796e-05, | |
| "loss": 0.0208, | |
| "step": 242, | |
| "step_time": 8.75198459999956 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1034.0, | |
| "completions/max_terminated_length": 1034.0, | |
| "completions/mean_length": 975.015625, | |
| "completions/mean_terminated_length": 975.015625, | |
| "completions/min_length": 829.0, | |
| "completions/min_terminated_length": 829.0, | |
| "entropy": 0.1656077685765922, | |
| "epoch": 0.00486, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5207899808883667, | |
| "kl": 0.4836368393152952, | |
| "learning_rate": 9.999920650492399e-05, | |
| "loss": -0.0058, | |
| "num_tokens": 12815104.0, | |
| "reward": 8.474200248718262, | |
| "reward_std": 13.949186325073242, | |
| "rewards/rollout_reward_func/mean": 8.474200248718262, | |
| "rewards/rollout_reward_func/std": 15.287591934204102, | |
| "sampling/importance_sampling_ratio/max": 1.3931519985198975, | |
| "sampling/importance_sampling_ratio/mean": 0.9963239431381226, | |
| "sampling/importance_sampling_ratio/min": 7.17475301392767e-10, | |
| "sampling/sampling_logp_difference/max": 14.08260726928711, | |
| "sampling/sampling_logp_difference/mean": 0.0297236330807209, | |
| "step": 243, | |
| "step_time": 29.95098099400184 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04583333572372794, | |
| "clip_ratio/high_mean": 0.01562500116415322, | |
| "clip_ratio/low_mean": 0.019308037008158863, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.034933038405142725, | |
| "entropy": 0.17500293161720037, | |
| "epoch": 0.00488, | |
| "grad_norm": 0.2083873599767685, | |
| "kl": 0.4789597373455763, | |
| "learning_rate": 9.999919881981386e-05, | |
| "loss": -0.0127, | |
| "step": 244, | |
| "step_time": 9.603725530998418 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0005580357392318547, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0005580357392318547, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1032.0, | |
| "completions/max_terminated_length": 1032.0, | |
| "completions/mean_length": 960.71875, | |
| "completions/mean_terminated_length": 960.71875, | |
| "completions/min_length": 870.0, | |
| "completions/min_terminated_length": 870.0, | |
| "entropy": 0.1825911095365882, | |
| "epoch": 0.0049, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5889225602149963, | |
| "kl": 0.9096355475485325, | |
| "learning_rate": 9.999919109766759e-05, | |
| "loss": 0.0086, | |
| "num_tokens": 12927807.0, | |
| "reward": 4.357412338256836, | |
| "reward_std": 10.907012939453125, | |
| "rewards/rollout_reward_func/mean": 4.357412338256836, | |
| "rewards/rollout_reward_func/std": 11.52568531036377, | |
| "sampling/importance_sampling_ratio/max": 1.709380030632019, | |
| "sampling/importance_sampling_ratio/mean": 1.0086195468902588, | |
| "sampling/importance_sampling_ratio/min": 0.7435536980628967, | |
| "sampling/sampling_logp_difference/max": 0.24706459045410156, | |
| "sampling/sampling_logp_difference/mean": 0.007025801111012697, | |
| "step": 245, | |
| "step_time": 31.461640581997926 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04583333572372794, | |
| "clip_ratio/high_mean": 0.014583334210328758, | |
| "clip_ratio/low_mean": 0.02460007555782795, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.039183410233817995, | |
| "entropy": 0.1923405658453703, | |
| "epoch": 0.00492, | |
| "grad_norm": 0.2593821585178375, | |
| "kl": 0.6401933804154396, | |
| "learning_rate": 9.999918333848517e-05, | |
| "loss": -0.0009, | |
| "step": 246, | |
| "step_time": 7.859529026000018 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004464285913854837, | |
| "clip_ratio/high_mean": 0.0011160714784637094, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0011160714784637094, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1032.0, | |
| "completions/max_terminated_length": 1032.0, | |
| "completions/mean_length": 962.265625, | |
| "completions/mean_terminated_length": 962.265625, | |
| "completions/min_length": 699.0, | |
| "completions/min_terminated_length": 699.0, | |
| "entropy": 0.21913561783730984, | |
| "epoch": 0.00494, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5455278158187866, | |
| "kl": 0.5414405167102814, | |
| "learning_rate": 9.999917554226662e-05, | |
| "loss": 0.0071, | |
| "num_tokens": 13040672.0, | |
| "reward": 8.484970092773438, | |
| "reward_std": 13.802679061889648, | |
| "rewards/rollout_reward_func/mean": 8.484970092773438, | |
| "rewards/rollout_reward_func/std": 13.872236251831055, | |
| "sampling/importance_sampling_ratio/max": 1.3358259201049805, | |
| "sampling/importance_sampling_ratio/mean": 0.9915522336959839, | |
| "sampling/importance_sampling_ratio/min": 0.004653692711144686, | |
| "sampling/sampling_logp_difference/max": 4.521495819091797, | |
| "sampling/sampling_logp_difference/mean": 0.016463816165924072, | |
| "step": 247, | |
| "step_time": 32.296578387999034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07113095559179783, | |
| "clip_ratio/high_mean": 0.025074406410567462, | |
| "clip_ratio/low_mean": 0.03020833490882069, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05528274131938815, | |
| "entropy": 0.2346064280718565, | |
| "epoch": 0.00496, | |
| "grad_norm": 0.28457146883010864, | |
| "kl": 0.5182771291583776, | |
| "learning_rate": 9.999916770901196e-05, | |
| "loss": 0.0003, | |
| "step": 248, | |
| "step_time": 8.322071146998496 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008333333767950535, | |
| "clip_ratio/high_mean": 0.0020833334419876337, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1038.0, | |
| "completions/max_terminated_length": 1038.0, | |
| "completions/mean_length": 947.125, | |
| "completions/mean_terminated_length": 947.125, | |
| "completions/min_length": 887.0, | |
| "completions/min_terminated_length": 887.0, | |
| "entropy": 0.22256971709430218, | |
| "epoch": 0.00498, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4911050498485565, | |
| "kl": 0.5106718242168427, | |
| "learning_rate": 9.999915983872117e-05, | |
| "loss": 0.0185, | |
| "num_tokens": 13152426.0, | |
| "reward": 8.859310150146484, | |
| "reward_std": 14.251840591430664, | |
| "rewards/rollout_reward_func/mean": 8.859310150146484, | |
| "rewards/rollout_reward_func/std": 15.995503425598145, | |
| "sampling/importance_sampling_ratio/max": 1.1877168416976929, | |
| "sampling/importance_sampling_ratio/mean": 1.0040578842163086, | |
| "sampling/importance_sampling_ratio/min": 0.7722747921943665, | |
| "sampling/sampling_logp_difference/max": 0.2686450481414795, | |
| "sampling/sampling_logp_difference/mean": 0.008781258016824722, | |
| "step": 249, | |
| "step_time": 31.257320647998313 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.058333335909992456, | |
| "clip_ratio/high_mean": 0.019791667815297842, | |
| "clip_ratio/low_mean": 0.02544642984867096, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04523809743113816, | |
| "entropy": 0.22759789694100618, | |
| "epoch": 0.005, | |
| "grad_norm": 0.32039502263069153, | |
| "kl": 0.5074543356895447, | |
| "learning_rate": 9.999915193139428e-05, | |
| "loss": 0.0067, | |
| "step": 250, | |
| "step_time": 8.727711221999925 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0010416667209938169, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1033.0, | |
| "completions/max_terminated_length": 1033.0, | |
| "completions/mean_length": 972.3125, | |
| "completions/mean_terminated_length": 972.3125, | |
| "completions/min_length": 885.0, | |
| "completions/min_terminated_length": 885.0, | |
| "entropy": 0.2637898661196232, | |
| "epoch": 0.00502, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6115661263465881, | |
| "kl": 0.5041398257017136, | |
| "learning_rate": 9.999914398703127e-05, | |
| "loss": 0.0222, | |
| "num_tokens": 13265926.0, | |
| "reward": 8.36276626586914, | |
| "reward_std": 12.634754180908203, | |
| "rewards/rollout_reward_func/mean": 8.36276626586914, | |
| "rewards/rollout_reward_func/std": 13.79938793182373, | |
| "sampling/importance_sampling_ratio/max": 1.3860008716583252, | |
| "sampling/importance_sampling_ratio/mean": 0.9989358186721802, | |
| "sampling/importance_sampling_ratio/min": 0.6789365410804749, | |
| "sampling/sampling_logp_difference/max": 0.4403858184814453, | |
| "sampling/sampling_logp_difference/mean": 0.011640775017440319, | |
| "step": 251, | |
| "step_time": 30.78323078500125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08333333721384406, | |
| "clip_ratio/high_mean": 0.026041668141260743, | |
| "clip_ratio/low_mean": 0.024479167768731713, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05052083625923842, | |
| "entropy": 0.25762353744357824, | |
| "epoch": 0.00504, | |
| "grad_norm": 0.33068326115608215, | |
| "kl": 0.5315965916961432, | |
| "learning_rate": 9.99991360056322e-05, | |
| "loss": 0.0108, | |
| "step": 252, | |
| "step_time": 8.417674423999415 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0010416667209938169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1039.0, | |
| "completions/max_terminated_length": 1039.0, | |
| "completions/mean_length": 962.484375, | |
| "completions/mean_terminated_length": 962.484375, | |
| "completions/min_length": 640.0, | |
| "completions/min_terminated_length": 640.0, | |
| "entropy": 0.2320685014128685, | |
| "epoch": 0.00506, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.606693685054779, | |
| "kl": 0.5243742056190968, | |
| "learning_rate": 9.999912798719702e-05, | |
| "loss": 0.0154, | |
| "num_tokens": 13378838.0, | |
| "reward": 5.471320152282715, | |
| "reward_std": 16.305179595947266, | |
| "rewards/rollout_reward_func/mean": 5.471320629119873, | |
| "rewards/rollout_reward_func/std": 16.513338088989258, | |
| "sampling/importance_sampling_ratio/max": 1.3763768672943115, | |
| "sampling/importance_sampling_ratio/mean": 0.9975243806838989, | |
| "sampling/importance_sampling_ratio/min": 0.706875205039978, | |
| "sampling/sampling_logp_difference/max": 0.28901320695877075, | |
| "sampling/sampling_logp_difference/mean": 0.009924216195940971, | |
| "step": 253, | |
| "step_time": 31.570553302000008 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.058333336375653744, | |
| "clip_ratio/high_mean": 0.018750001094304025, | |
| "clip_ratio/low_mean": 0.026041667792014778, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.044791669119149446, | |
| "entropy": 0.21940706949681044, | |
| "epoch": 0.00508, | |
| "grad_norm": 0.3425885736942291, | |
| "kl": 0.6501965597271919, | |
| "learning_rate": 9.999911993172577e-05, | |
| "loss": 0.0077, | |
| "step": 254, | |
| "step_time": 8.264053588000024 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004166666883975267, | |
| "clip_ratio/high_mean": 0.0010416667209938169, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250001629814506, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1031.0, | |
| "completions/max_terminated_length": 1031.0, | |
| "completions/mean_length": 953.46875, | |
| "completions/mean_terminated_length": 953.46875, | |
| "completions/min_length": 900.0, | |
| "completions/min_terminated_length": 900.0, | |
| "entropy": 0.21140480507165194, | |
| "epoch": 0.0051, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5503631234169006, | |
| "kl": 0.5327232480049133, | |
| "learning_rate": 9.999911183921846e-05, | |
| "loss": 0.0038, | |
| "num_tokens": 13491042.0, | |
| "reward": 8.259774208068848, | |
| "reward_std": 10.848722457885742, | |
| "rewards/rollout_reward_func/mean": 8.259774208068848, | |
| "rewards/rollout_reward_func/std": 11.306256294250488, | |
| "sampling/importance_sampling_ratio/max": 1.4545994997024536, | |
| "sampling/importance_sampling_ratio/mean": 0.9899890422821045, | |
| "sampling/importance_sampling_ratio/min": 0.6251944303512573, | |
| "sampling/sampling_logp_difference/max": 0.42076706886291504, | |
| "sampling/sampling_logp_difference/mean": 0.01122802309691906, | |
| "step": 255, | |
| "step_time": 31.768314117000045 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05476190708577633, | |
| "clip_ratio/high_mean": 0.022023811121471226, | |
| "clip_ratio/low_mean": 0.022916668327525258, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04494047968182713, | |
| "entropy": 0.21271847933530807, | |
| "epoch": 0.00512, | |
| "grad_norm": 1.165165662765503, | |
| "kl": 0.5640581175684929, | |
| "learning_rate": 9.999910370967507e-05, | |
| "loss": -0.0008, | |
| "step": 256, | |
| "step_time": 8.655397303001337 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0008680555620230734, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1343.0, | |
| "completions/max_terminated_length": 1343.0, | |
| "completions/mean_length": 1241.921875, | |
| "completions/mean_terminated_length": 1241.921875, | |
| "completions/min_length": 1084.0, | |
| "completions/min_terminated_length": 1084.0, | |
| "entropy": 0.2426544101908803, | |
| "epoch": 0.00514, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8530539870262146, | |
| "kl": 0.5651203468441963, | |
| "learning_rate": 9.999909554309565e-05, | |
| "loss": 0.0047, | |
| "num_tokens": 13621717.0, | |
| "reward": 4.329623699188232, | |
| "reward_std": 14.394445419311523, | |
| "rewards/rollout_reward_func/mean": 4.329623222351074, | |
| "rewards/rollout_reward_func/std": 14.93822193145752, | |
| "sampling/importance_sampling_ratio/max": 1.2321135997772217, | |
| "sampling/importance_sampling_ratio/mean": 0.9581992626190186, | |
| "sampling/importance_sampling_ratio/min": 0.2660026550292969, | |
| "sampling/sampling_logp_difference/max": 1.2229857444763184, | |
| "sampling/sampling_logp_difference/mean": 0.013221165165305138, | |
| "step": 257, | |
| "step_time": 37.82865672800108 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06597222317941487, | |
| "clip_ratio/high_mean": 0.027732091082725674, | |
| "clip_ratio/low_mean": 0.038194445020053536, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.06592653610277921, | |
| "entropy": 0.2270987592637539, | |
| "epoch": 0.00516, | |
| "grad_norm": 0.43574994802474976, | |
| "kl": 0.6589642316102982, | |
| "learning_rate": 9.999908733948017e-05, | |
| "loss": -0.0093, | |
| "step": 258, | |
| "step_time": 10.512357729995529 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041666860692203, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1366.0, | |
| "completions/max_terminated_length": 1366.0, | |
| "completions/mean_length": 1250.96875, | |
| "completions/mean_terminated_length": 1250.96875, | |
| "completions/min_length": 735.0, | |
| "completions/min_terminated_length": 735.0, | |
| "entropy": 0.213697855360806, | |
| "epoch": 0.00518, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6505308747291565, | |
| "kl": 0.5941296126693487, | |
| "learning_rate": 9.999907909882866e-05, | |
| "loss": -0.0204, | |
| "num_tokens": 13753091.0, | |
| "reward": 6.001728057861328, | |
| "reward_std": 15.827871322631836, | |
| "rewards/rollout_reward_func/mean": 6.001728057861328, | |
| "rewards/rollout_reward_func/std": 16.02460479736328, | |
| "sampling/importance_sampling_ratio/max": 1.3350346088409424, | |
| "sampling/importance_sampling_ratio/mean": 0.9674654006958008, | |
| "sampling/importance_sampling_ratio/min": 0.5470981001853943, | |
| "sampling/sampling_logp_difference/max": 0.5512038469314575, | |
| "sampling/sampling_logp_difference/mean": 0.011880462057888508, | |
| "step": 259, | |
| "step_time": 38.35317581399886 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05208333395421505, | |
| "clip_ratio/high_mean": 0.015625000174622983, | |
| "clip_ratio/low_mean": 0.04037990275537595, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.056004903570283204, | |
| "entropy": 0.1957033844664693, | |
| "epoch": 0.0052, | |
| "grad_norm": 0.45430490374565125, | |
| "kl": 0.781089099124074, | |
| "learning_rate": 9.999907082114112e-05, | |
| "loss": -0.0313, | |
| "step": 260, | |
| "step_time": 9.020615039000404 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.016812865156680346, | |
| "clip_ratio/high_mean": 0.004203216289170086, | |
| "clip_ratio/low_mean": 0.0026041666860692203, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006807382975239307, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1335.0, | |
| "completions/max_terminated_length": 1335.0, | |
| "completions/mean_length": 1213.078125, | |
| "completions/mean_terminated_length": 1213.078125, | |
| "completions/min_length": 626.0, | |
| "completions/min_terminated_length": 626.0, | |
| "entropy": 0.18847014661878347, | |
| "epoch": 0.00522, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6835371851921082, | |
| "kl": 0.542348800227046, | |
| "learning_rate": 9.999906250641758e-05, | |
| "loss": 0.0145, | |
| "num_tokens": 13881882.0, | |
| "reward": 5.304417610168457, | |
| "reward_std": 14.105676651000977, | |
| "rewards/rollout_reward_func/mean": 5.304417133331299, | |
| "rewards/rollout_reward_func/std": 14.791868209838867, | |
| "sampling/importance_sampling_ratio/max": 1.3720983266830444, | |
| "sampling/importance_sampling_ratio/mean": 0.9722362756729126, | |
| "sampling/importance_sampling_ratio/min": 8.055465437370129e-20, | |
| "sampling/sampling_logp_difference/max": 38.39814376831055, | |
| "sampling/sampling_logp_difference/mean": 0.05052501708269119, | |
| "step": 261, | |
| "step_time": 39.094199752998065 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05559855583123863, | |
| "clip_ratio/high_mean": 0.018239916767925024, | |
| "clip_ratio/low_mean": 0.02690972271375358, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.04514963936526328, | |
| "entropy": 0.1835553077980876, | |
| "epoch": 0.00524, | |
| "grad_norm": 0.266696572303772, | |
| "kl": 0.5767297390848398, | |
| "learning_rate": 9.9999054154658e-05, | |
| "loss": 0.0025, | |
| "step": 262, | |
| "step_time": 9.272368914001163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666744276881, | |
| "clip_ratio/high_mean": 0.0026041666860692203, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041666860692203, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1344.0, | |
| "completions/max_terminated_length": 1344.0, | |
| "completions/mean_length": 1241.4375, | |
| "completions/mean_terminated_length": 1241.4375, | |
| "completions/min_length": 902.0, | |
| "completions/min_terminated_length": 902.0, | |
| "entropy": 0.1920191366225481, | |
| "epoch": 0.00526, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8278535604476929, | |
| "kl": 0.5627734903246164, | |
| "learning_rate": 9.999904576586242e-05, | |
| "loss": -0.0123, | |
| "num_tokens": 14012567.0, | |
| "reward": 3.131194591522217, | |
| "reward_std": 12.649508476257324, | |
| "rewards/rollout_reward_func/mean": 3.131195068359375, | |
| "rewards/rollout_reward_func/std": 12.768006324768066, | |
| "sampling/importance_sampling_ratio/max": 1.5249695777893066, | |
| "sampling/importance_sampling_ratio/mean": 1.0084636211395264, | |
| "sampling/importance_sampling_ratio/min": 0.6291685700416565, | |
| "sampling/sampling_logp_difference/max": 0.48067259788513184, | |
| "sampling/sampling_logp_difference/mean": 0.011031190864741802, | |
| "step": 263, | |
| "step_time": 37.94770654900185 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06597222364507616, | |
| "clip_ratio/high_mean": 0.02170138922519982, | |
| "clip_ratio/low_mean": 0.027777778508607298, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04947916854871437, | |
| "entropy": 0.1821621311828494, | |
| "epoch": 0.00528, | |
| "grad_norm": 0.30830591917037964, | |
| "kl": 0.6204142663627863, | |
| "learning_rate": 9.999903734003084e-05, | |
| "loss": -0.0238, | |
| "step": 264, | |
| "step_time": 9.788196208000045 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666744276881, | |
| "clip_ratio/high_mean": 0.0026041666860692203, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004340277810115367, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1354.0, | |
| "completions/max_terminated_length": 1354.0, | |
| "completions/mean_length": 1215.875, | |
| "completions/mean_terminated_length": 1215.875, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "entropy": 0.18026093766093254, | |
| "epoch": 0.0053, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7985097765922546, | |
| "kl": 0.5250384621322155, | |
| "learning_rate": 9.999902887716329e-05, | |
| "loss": -0.0455, | |
| "num_tokens": 14141610.0, | |
| "reward": 2.849423408508301, | |
| "reward_std": 12.35162353515625, | |
| "rewards/rollout_reward_func/mean": 2.849423408508301, | |
| "rewards/rollout_reward_func/std": 12.910691261291504, | |
| "sampling/importance_sampling_ratio/max": 1.7354861497879028, | |
| "sampling/importance_sampling_ratio/mean": 0.9913997650146484, | |
| "sampling/importance_sampling_ratio/min": 0.53452068567276, | |
| "sampling/sampling_logp_difference/max": 0.5425161123275757, | |
| "sampling/sampling_logp_difference/mean": 0.012209449894726276, | |
| "step": 265, | |
| "step_time": 38.11352587100009 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06311274622566998, | |
| "clip_ratio/high_mean": 0.01925040880450979, | |
| "clip_ratio/low_mean": 0.030831291631329805, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.050081700494047254, | |
| "entropy": 0.1743807177990675, | |
| "epoch": 0.00532, | |
| "grad_norm": 0.9081993103027344, | |
| "kl": 1.4623642209917307, | |
| "learning_rate": 9.999902037725976e-05, | |
| "loss": -0.0483, | |
| "step": 266, | |
| "step_time": 9.7086338709978 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666744276881, | |
| "clip_ratio/high_mean": 0.0026041666860692203, | |
| "clip_ratio/low_mean": 0.005259395460598171, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007863562146667391, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1349.0, | |
| "completions/max_terminated_length": 1349.0, | |
| "completions/mean_length": 1235.953125, | |
| "completions/mean_terminated_length": 1235.953125, | |
| "completions/min_length": 769.0, | |
| "completions/min_terminated_length": 769.0, | |
| "entropy": 0.1784328306093812, | |
| "epoch": 0.00534, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8215274810791016, | |
| "kl": 0.49557984061539173, | |
| "learning_rate": 9.999901184032026e-05, | |
| "loss": 0.0099, | |
| "num_tokens": 14271910.0, | |
| "reward": 6.570675849914551, | |
| "reward_std": 11.428293228149414, | |
| "rewards/rollout_reward_func/mean": 6.570675849914551, | |
| "rewards/rollout_reward_func/std": 11.919609069824219, | |
| "sampling/importance_sampling_ratio/max": 1.5103188753128052, | |
| "sampling/importance_sampling_ratio/mean": 1.018727421760559, | |
| "sampling/importance_sampling_ratio/min": 1.0843930725359919e-15, | |
| "sampling/sampling_logp_difference/max": 27.71844482421875, | |
| "sampling/sampling_logp_difference/mean": 0.04110131412744522, | |
| "step": 267, | |
| "step_time": 40.345479872002215 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08261846494860947, | |
| "clip_ratio/high_mean": 0.025862949551083148, | |
| "clip_ratio/low_mean": 0.025904605397954583, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05176755564752966, | |
| "entropy": 0.18014734331518412, | |
| "epoch": 0.00536, | |
| "grad_norm": 0.3775624632835388, | |
| "kl": 0.5100179798901081, | |
| "learning_rate": 9.99990032663448e-05, | |
| "loss": -0.0005, | |
| "step": 268, | |
| "step_time": 8.8038962849987 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010620915098115802, | |
| "clip_ratio/high_mean": 0.0026552287745289505, | |
| "clip_ratio/low_mean": 0.001787173212505877, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0044424019870348275, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1348.0, | |
| "completions/max_terminated_length": 1348.0, | |
| "completions/mean_length": 1209.5625, | |
| "completions/mean_terminated_length": 1209.5625, | |
| "completions/min_length": 423.0, | |
| "completions/min_terminated_length": 423.0, | |
| "entropy": 0.19260858092457056, | |
| "epoch": 0.00538, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9969385266304016, | |
| "kl": 0.48063670098781586, | |
| "learning_rate": 9.999899465533337e-05, | |
| "loss": -0.0145, | |
| "num_tokens": 14400520.0, | |
| "reward": 4.870312690734863, | |
| "reward_std": 12.755669593811035, | |
| "rewards/rollout_reward_func/mean": 4.870312690734863, | |
| "rewards/rollout_reward_func/std": 12.786203384399414, | |
| "sampling/importance_sampling_ratio/max": 1.391904354095459, | |
| "sampling/importance_sampling_ratio/mean": 0.9837595224380493, | |
| "sampling/importance_sampling_ratio/min": 0.5111071467399597, | |
| "sampling/sampling_logp_difference/max": 0.6141395568847656, | |
| "sampling/sampling_logp_difference/mean": 0.012509889900684357, | |
| "step": 269, | |
| "step_time": 39.08969898599935 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07679738639853895, | |
| "clip_ratio/high_mean": 0.025275735883042216, | |
| "clip_ratio/low_mean": 0.03416053985711187, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.05943627591477707, | |
| "entropy": 0.190420214086771, | |
| "epoch": 0.0054, | |
| "grad_norm": 2.133584499359131, | |
| "kl": 1.2963667679578066, | |
| "learning_rate": 9.999898600728599e-05, | |
| "loss": -0.0154, | |
| "step": 270, | |
| "step_time": 10.067689283000618 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1356.0, | |
| "completions/max_terminated_length": 1356.0, | |
| "completions/mean_length": 1241.09375, | |
| "completions/mean_terminated_length": 1241.09375, | |
| "completions/min_length": 641.0, | |
| "completions/min_terminated_length": 641.0, | |
| "entropy": 0.17894164565950632, | |
| "epoch": 0.00542, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.759871780872345, | |
| "kl": 0.4996040966361761, | |
| "learning_rate": 9.999897732220269e-05, | |
| "loss": -0.0452, | |
| "num_tokens": 14531215.0, | |
| "reward": 6.598260879516602, | |
| "reward_std": 12.557649612426758, | |
| "rewards/rollout_reward_func/mean": 6.598260402679443, | |
| "rewards/rollout_reward_func/std": 12.858835220336914, | |
| "sampling/importance_sampling_ratio/max": 1.6857366561889648, | |
| "sampling/importance_sampling_ratio/mean": 1.0335665941238403, | |
| "sampling/importance_sampling_ratio/min": 0.555221676826477, | |
| "sampling/sampling_logp_difference/max": 0.583274245262146, | |
| "sampling/sampling_logp_difference/mean": 0.010182222351431847, | |
| "step": 271, | |
| "step_time": 38.82532325500051 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03513071942143142, | |
| "clip_ratio/high_mean": 0.009650735417380929, | |
| "clip_ratio/low_mean": 0.020450367941521108, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.030101103708148003, | |
| "entropy": 0.17972450237721205, | |
| "epoch": 0.00544, | |
| "grad_norm": 0.39173194766044617, | |
| "kl": 0.5205750651657581, | |
| "learning_rate": 9.999896860008347e-05, | |
| "loss": -0.052, | |
| "step": 272, | |
| "step_time": 10.316601943999558 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0026041666860692203, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004340277810115367, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1340.0, | |
| "completions/max_terminated_length": 1340.0, | |
| "completions/mean_length": 1249.921875, | |
| "completions/mean_terminated_length": 1249.921875, | |
| "completions/min_length": 663.0, | |
| "completions/min_terminated_length": 663.0, | |
| "entropy": 0.18446057755500078, | |
| "epoch": 0.00546, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5517368912696838, | |
| "kl": 0.5441189091652632, | |
| "learning_rate": 9.999895984092831e-05, | |
| "loss": 0.0131, | |
| "num_tokens": 14662474.0, | |
| "reward": 5.8217644691467285, | |
| "reward_std": 11.078777313232422, | |
| "rewards/rollout_reward_func/mean": 5.8217644691467285, | |
| "rewards/rollout_reward_func/std": 11.748489379882812, | |
| "sampling/importance_sampling_ratio/max": 2.5323374271392822, | |
| "sampling/importance_sampling_ratio/mean": 0.9794174432754517, | |
| "sampling/importance_sampling_ratio/min": 1.1464784742225287e-13, | |
| "sampling/sampling_logp_difference/max": 23.46839714050293, | |
| "sampling/sampling_logp_difference/mean": 0.03605649992823601, | |
| "step": 273, | |
| "step_time": 38.64782374399874 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.041483918437734246, | |
| "clip_ratio/high_mean": 0.01210709079168737, | |
| "clip_ratio/low_mean": 0.049096201779320836, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06120329239638522, | |
| "entropy": 0.17116414476186037, | |
| "epoch": 0.00548, | |
| "grad_norm": 0.3144800662994385, | |
| "kl": 0.6644695494323969, | |
| "learning_rate": 9.999895104473725e-05, | |
| "loss": 0.0043, | |
| "step": 274, | |
| "step_time": 8.841590996999912 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01756535959430039, | |
| "clip_ratio/high_mean": 0.006127451022621244, | |
| "clip_ratio/low_mean": 0.0026552287745289505, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008782679797150195, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1370.0, | |
| "completions/max_terminated_length": 1370.0, | |
| "completions/mean_length": 1246.375, | |
| "completions/mean_terminated_length": 1246.375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.17010682448744774, | |
| "epoch": 0.0055, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5666040182113647, | |
| "kl": 0.5793958213180304, | |
| "learning_rate": 9.99989422115103e-05, | |
| "loss": 0.0183, | |
| "num_tokens": 14793449.0, | |
| "reward": 2.9848508834838867, | |
| "reward_std": 12.649776458740234, | |
| "rewards/rollout_reward_func/mean": 2.984851121902466, | |
| "rewards/rollout_reward_func/std": 13.012813568115234, | |
| "sampling/importance_sampling_ratio/max": 1.5569446086883545, | |
| "sampling/importance_sampling_ratio/mean": 0.9847633838653564, | |
| "sampling/importance_sampling_ratio/min": 0.6424822807312012, | |
| "sampling/sampling_logp_difference/max": 0.4623146057128906, | |
| "sampling/sampling_logp_difference/mean": 0.00930742733180523, | |
| "step": 275, | |
| "step_time": 39.34428709400072 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.054125817492604256, | |
| "clip_ratio/high_mean": 0.016135621059220284, | |
| "clip_ratio/low_mean": 0.0301164222182706, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.046252043335698545, | |
| "entropy": 0.16701707802712917, | |
| "epoch": 0.00552, | |
| "grad_norm": 0.6509947180747986, | |
| "kl": 0.6457913182675838, | |
| "learning_rate": 9.999893334124744e-05, | |
| "loss": 0.0127, | |
| "step": 276, | |
| "step_time": 9.492375128998901 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666744276881, | |
| "clip_ratio/high_mean": 0.0026041666860692203, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034722222480922937, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1342.0, | |
| "completions/max_terminated_length": 1342.0, | |
| "completions/mean_length": 1211.75, | |
| "completions/mean_terminated_length": 1211.75, | |
| "completions/min_length": 786.0, | |
| "completions/min_terminated_length": 786.0, | |
| "entropy": 0.16395934531465173, | |
| "epoch": 0.00554, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6039373874664307, | |
| "kl": 0.6709765158593655, | |
| "learning_rate": 9.999892443394869e-05, | |
| "loss": -0.0199, | |
| "num_tokens": 14922202.0, | |
| "reward": 9.567506790161133, | |
| "reward_std": 12.886774063110352, | |
| "rewards/rollout_reward_func/mean": 9.567506790161133, | |
| "rewards/rollout_reward_func/std": 14.272911071777344, | |
| "sampling/importance_sampling_ratio/max": 1.3702117204666138, | |
| "sampling/importance_sampling_ratio/mean": 0.9926258325576782, | |
| "sampling/importance_sampling_ratio/min": 2.4751771812714374e-13, | |
| "sampling/sampling_logp_difference/max": 22.218292236328125, | |
| "sampling/sampling_logp_difference/mean": 0.03479118272662163, | |
| "step": 277, | |
| "step_time": 38.66672314299831 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.056832108180969954, | |
| "clip_ratio/high_mean": 0.015944138227496296, | |
| "clip_ratio/low_mean": 0.026416234264615923, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04236037301598117, | |
| "entropy": 0.16752836294472218, | |
| "epoch": 0.00556, | |
| "grad_norm": 0.2952568829059601, | |
| "kl": 0.7055745627731085, | |
| "learning_rate": 9.999891548961409e-05, | |
| "loss": -0.0283, | |
| "step": 278, | |
| "step_time": 10.249573535998024 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0026552287745289505, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003523284336552024, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1344.0, | |
| "completions/max_terminated_length": 1344.0, | |
| "completions/mean_length": 1206.921875, | |
| "completions/mean_terminated_length": 1206.921875, | |
| "completions/min_length": 193.0, | |
| "completions/min_terminated_length": 193.0, | |
| "entropy": 0.16296257637441158, | |
| "epoch": 0.00558, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6338427662849426, | |
| "kl": 0.5948988310992718, | |
| "learning_rate": 9.99989065082436e-05, | |
| "loss": -0.0193, | |
| "num_tokens": 15050626.0, | |
| "reward": 6.859679222106934, | |
| "reward_std": 13.335336685180664, | |
| "rewards/rollout_reward_func/mean": 6.859679222106934, | |
| "rewards/rollout_reward_func/std": 13.806427955627441, | |
| "sampling/importance_sampling_ratio/max": 1.73651123046875, | |
| "sampling/importance_sampling_ratio/mean": 1.0235867500305176, | |
| "sampling/importance_sampling_ratio/min": 0.6915313005447388, | |
| "sampling/sampling_logp_difference/max": 0.3299523591995239, | |
| "sampling/sampling_logp_difference/mean": 0.00790142547339201, | |
| "step": 279, | |
| "step_time": 37.97661670400066 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04227941203862429, | |
| "clip_ratio/high_mean": 0.01235702628036961, | |
| "clip_ratio/low_mean": 0.023852379759773612, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.036209406214766204, | |
| "entropy": 0.17103052884340286, | |
| "epoch": 0.0056, | |
| "grad_norm": 0.310857355594635, | |
| "kl": 0.6127588897943497, | |
| "learning_rate": 9.999889748983726e-05, | |
| "loss": -0.0289, | |
| "step": 280, | |
| "step_time": 9.740956478998669 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.007148692850023508, | |
| "clip_ratio/high_mean": 0.001787173212505877, | |
| "clip_ratio/low_mean": 0.0035807291860692203, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005367902398575097, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1347.0, | |
| "completions/max_terminated_length": 1347.0, | |
| "completions/mean_length": 1242.203125, | |
| "completions/mean_terminated_length": 1242.203125, | |
| "completions/min_length": 1071.0, | |
| "completions/min_terminated_length": 1071.0, | |
| "entropy": 0.18003392685204744, | |
| "epoch": 0.00562, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5553699731826782, | |
| "kl": 0.5920679531991482, | |
| "learning_rate": 9.999888843439508e-05, | |
| "loss": 0.0235, | |
| "num_tokens": 15181392.0, | |
| "reward": 3.9797325134277344, | |
| "reward_std": 11.883782386779785, | |
| "rewards/rollout_reward_func/mean": 3.9797325134277344, | |
| "rewards/rollout_reward_func/std": 12.557183265686035, | |
| "sampling/importance_sampling_ratio/max": 2.3216447830200195, | |
| "sampling/importance_sampling_ratio/mean": 1.0259038209915161, | |
| "sampling/importance_sampling_ratio/min": 0.37790024280548096, | |
| "sampling/sampling_logp_difference/max": 1.4781968593597412, | |
| "sampling/sampling_logp_difference/mean": 0.011046608909964561, | |
| "step": 281, | |
| "step_time": 39.572295284000575 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04312193673104048, | |
| "clip_ratio/high_mean": 0.01343571295728907, | |
| "clip_ratio/low_mean": 0.023201337666250765, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03663705079816282, | |
| "entropy": 0.18677309434860945, | |
| "epoch": 0.00564, | |
| "grad_norm": 0.5401102304458618, | |
| "kl": 0.5960894413292408, | |
| "learning_rate": 9.999887934191704e-05, | |
| "loss": 0.0166, | |
| "step": 282, | |
| "step_time": 9.04144253800041 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.007582720601931214, | |
| "clip_ratio/high_mean": 0.0018956801504828036, | |
| "clip_ratio/low_mean": 0.0009191176504828036, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002814797800965607, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1338.0, | |
| "completions/max_terminated_length": 1338.0, | |
| "completions/mean_length": 1215.859375, | |
| "completions/mean_terminated_length": 1215.859375, | |
| "completions/min_length": 1067.0, | |
| "completions/min_terminated_length": 1067.0, | |
| "entropy": 0.18976869899779558, | |
| "epoch": 0.00566, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6778466105461121, | |
| "kl": 0.8045855388045311, | |
| "learning_rate": 9.99988702124032e-05, | |
| "loss": 0.0435, | |
| "num_tokens": 15310402.0, | |
| "reward": 9.532302856445312, | |
| "reward_std": 13.447786331176758, | |
| "rewards/rollout_reward_func/mean": 9.532302856445312, | |
| "rewards/rollout_reward_func/std": 14.893537521362305, | |
| "sampling/importance_sampling_ratio/max": 1.5014206171035767, | |
| "sampling/importance_sampling_ratio/mean": 0.984979510307312, | |
| "sampling/importance_sampling_ratio/min": 0.5918754935264587, | |
| "sampling/sampling_logp_difference/max": 0.49157631397247314, | |
| "sampling/sampling_logp_difference/mean": 0.009644631296396255, | |
| "step": 283, | |
| "step_time": 37.75929147200077 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04337724717333913, | |
| "clip_ratio/high_mean": 0.017022824671585113, | |
| "clip_ratio/low_mean": 0.019767412508372217, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.036790237470995635, | |
| "entropy": 0.19258240424096584, | |
| "epoch": 0.00568, | |
| "grad_norm": 0.23295848071575165, | |
| "kl": 0.7846251800656319, | |
| "learning_rate": 9.999886104585351e-05, | |
| "loss": 0.0377, | |
| "step": 284, | |
| "step_time": 9.718582901003174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0036764706601388752, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004544526163954288, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1352.0, | |
| "completions/max_terminated_length": 1352.0, | |
| "completions/mean_length": 1182.8125, | |
| "completions/mean_terminated_length": 1182.8125, | |
| "completions/min_length": 644.0, | |
| "completions/min_terminated_length": 644.0, | |
| "entropy": 0.21535112708806992, | |
| "epoch": 0.0057, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8615608215332031, | |
| "kl": 0.8677664548158646, | |
| "learning_rate": 9.999885184226802e-05, | |
| "loss": -0.0133, | |
| "num_tokens": 15437277.0, | |
| "reward": 4.226072788238525, | |
| "reward_std": 8.478089332580566, | |
| "rewards/rollout_reward_func/mean": 4.226072788238525, | |
| "rewards/rollout_reward_func/std": 9.509638786315918, | |
| "sampling/importance_sampling_ratio/max": 1.921204924583435, | |
| "sampling/importance_sampling_ratio/mean": 0.9768272638320923, | |
| "sampling/importance_sampling_ratio/min": 0.7244350910186768, | |
| "sampling/sampling_logp_difference/max": 0.4400520324707031, | |
| "sampling/sampling_logp_difference/mean": 0.010012689046561718, | |
| "step": 285, | |
| "step_time": 36.26275280400023 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04353043343871832, | |
| "clip_ratio/high_mean": 0.01611264329403639, | |
| "clip_ratio/low_mean": 0.029692606767639518, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.045805250061675906, | |
| "entropy": 0.21773250121623278, | |
| "epoch": 0.00572, | |
| "grad_norm": 0.48840415477752686, | |
| "kl": 1.049767030403018, | |
| "learning_rate": 9.999884260164671e-05, | |
| "loss": -0.0254, | |
| "step": 286, | |
| "step_time": 10.79683871799898 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01797385630197823, | |
| "clip_ratio/high_mean": 0.004493464075494558, | |
| "clip_ratio/low_mean": 0.003523284336552024, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008016748412046582, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1349.0, | |
| "completions/max_terminated_length": 1349.0, | |
| "completions/mean_length": 1224.109375, | |
| "completions/mean_terminated_length": 1224.109375, | |
| "completions/min_length": 697.0, | |
| "completions/min_terminated_length": 697.0, | |
| "entropy": 0.22404625453054905, | |
| "epoch": 0.00574, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.1797651052474976, | |
| "kl": 0.8139622360467911, | |
| "learning_rate": 9.999883332398962e-05, | |
| "loss": -0.0606, | |
| "num_tokens": 15566944.0, | |
| "reward": 5.630161762237549, | |
| "reward_std": 12.35897445678711, | |
| "rewards/rollout_reward_func/mean": 5.630161762237549, | |
| "rewards/rollout_reward_func/std": 13.792024612426758, | |
| "sampling/importance_sampling_ratio/max": 2.566322088241577, | |
| "sampling/importance_sampling_ratio/mean": 0.9811519384384155, | |
| "sampling/importance_sampling_ratio/min": 0.388390451669693, | |
| "sampling/sampling_logp_difference/max": 1.9641337394714355, | |
| "sampling/sampling_logp_difference/mean": 0.015188181772828102, | |
| "step": 287, | |
| "step_time": 36.44446017899918 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.053717320784926414, | |
| "clip_ratio/high_mean": 0.01695261470740661, | |
| "clip_ratio/low_mean": 0.04400914063444361, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.060961755632888526, | |
| "entropy": 0.2100704526528716, | |
| "epoch": 0.00576, | |
| "grad_norm": 1.2667500972747803, | |
| "kl": 2.074540827423334, | |
| "learning_rate": 9.999882400929674e-05, | |
| "loss": -0.057, | |
| "step": 288, | |
| "step_time": 8.99862431100064 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010850694496184587, | |
| "clip_ratio/high_mean": 0.0035807291860692203, | |
| "clip_ratio/low_mean": 0.001787173212505877, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005367902398575097, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1377.0, | |
| "completions/max_terminated_length": 1377.0, | |
| "completions/mean_length": 1206.5, | |
| "completions/mean_terminated_length": 1206.5, | |
| "completions/min_length": 202.0, | |
| "completions/min_terminated_length": 202.0, | |
| "entropy": 0.21379029098898172, | |
| "epoch": 0.00578, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9330686926841736, | |
| "kl": 0.6734739989042282, | |
| "learning_rate": 9.999881465756809e-05, | |
| "loss": -0.0075, | |
| "num_tokens": 15695392.0, | |
| "reward": 5.131735801696777, | |
| "reward_std": 13.59388256072998, | |
| "rewards/rollout_reward_func/mean": 5.1317362785339355, | |
| "rewards/rollout_reward_func/std": 15.563157081604004, | |
| "sampling/importance_sampling_ratio/max": 1.5120335817337036, | |
| "sampling/importance_sampling_ratio/mean": 0.9915132522583008, | |
| "sampling/importance_sampling_ratio/min": 0.7389032244682312, | |
| "sampling/sampling_logp_difference/max": 0.3448265790939331, | |
| "sampling/sampling_logp_difference/mean": 0.010639440268278122, | |
| "step": 289, | |
| "step_time": 38.13999567300107 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.052309082355350256, | |
| "clip_ratio/high_mean": 0.015732499363366514, | |
| "clip_ratio/low_mean": 0.04032860859297216, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.056061108596622944, | |
| "entropy": 0.20592329651117325, | |
| "epoch": 0.0058, | |
| "grad_norm": 1.7719804048538208, | |
| "kl": 2.317722400650382, | |
| "learning_rate": 9.999880526880367e-05, | |
| "loss": 0.0124, | |
| "step": 290, | |
| "step_time": 9.157000397000957 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041666860692203, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1351.0, | |
| "completions/max_terminated_length": 1351.0, | |
| "completions/mean_length": 1227.28125, | |
| "completions/mean_terminated_length": 1227.2381591796875, | |
| "completions/min_length": 1085.0, | |
| "completions/min_terminated_length": 1085.0, | |
| "entropy": 0.2259034337475896, | |
| "epoch": 0.00582, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5913375616073608, | |
| "kl": 0.5888024400919676, | |
| "learning_rate": 9.999879584300349e-05, | |
| "loss": -0.0201, | |
| "num_tokens": 15825170.0, | |
| "reward": 4.390281677246094, | |
| "reward_std": 13.705522537231445, | |
| "rewards/rollout_reward_func/mean": 4.390281677246094, | |
| "rewards/rollout_reward_func/std": 13.848193168640137, | |
| "sampling/importance_sampling_ratio/max": 1.4006119966506958, | |
| "sampling/importance_sampling_ratio/mean": 0.9845026731491089, | |
| "sampling/importance_sampling_ratio/min": 0.57123863697052, | |
| "sampling/sampling_logp_difference/max": 0.4590674638748169, | |
| "sampling/sampling_logp_difference/mean": 0.010318214073777199, | |
| "step": 291, | |
| "step_time": 38.378031336002095 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05575980432331562, | |
| "clip_ratio/high_mean": 0.018331290979404002, | |
| "clip_ratio/low_mean": 0.025366254791151732, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0436975461198017, | |
| "entropy": 0.25106900557875633, | |
| "epoch": 0.00584, | |
| "grad_norm": 0.3503086268901825, | |
| "kl": 0.6040437389165163, | |
| "learning_rate": 9.999878638016755e-05, | |
| "loss": -0.0261, | |
| "step": 292, | |
| "step_time": 10.2765881680034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1355.0, | |
| "completions/max_terminated_length": 1355.0, | |
| "completions/mean_length": 1203.28125, | |
| "completions/mean_terminated_length": 1203.28125, | |
| "completions/min_length": 196.0, | |
| "completions/min_terminated_length": 196.0, | |
| "entropy": 0.255804393440485, | |
| "epoch": 0.00586, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.733392059803009, | |
| "kl": 0.7084890268743038, | |
| "learning_rate": 9.99987768802959e-05, | |
| "loss": -0.0295, | |
| "num_tokens": 15953483.0, | |
| "reward": 4.652621269226074, | |
| "reward_std": 13.876627922058105, | |
| "rewards/rollout_reward_func/mean": 4.652621269226074, | |
| "rewards/rollout_reward_func/std": 14.444734573364258, | |
| "sampling/importance_sampling_ratio/max": 1.5388283729553223, | |
| "sampling/importance_sampling_ratio/mean": 0.9943192005157471, | |
| "sampling/importance_sampling_ratio/min": 0.66633540391922, | |
| "sampling/sampling_logp_difference/max": 0.3228440284729004, | |
| "sampling/sampling_logp_difference/mean": 0.009999222122132778, | |
| "step": 293, | |
| "step_time": 36.11094247699839 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07255117082968354, | |
| "clip_ratio/high_mean": 0.020741959451697767, | |
| "clip_ratio/low_mean": 0.026092729007359594, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0468346884008497, | |
| "entropy": 0.2808321360498667, | |
| "epoch": 0.00588, | |
| "grad_norm": 0.3306209444999695, | |
| "kl": 0.6640463471412659, | |
| "learning_rate": 9.99987673433885e-05, | |
| "loss": -0.0368, | |
| "step": 294, | |
| "step_time": 9.500305491999825 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0026041666860692203, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034722222480922937, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1353.0, | |
| "completions/max_terminated_length": 1353.0, | |
| "completions/mean_length": 1203.546875, | |
| "completions/mean_terminated_length": 1203.546875, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "entropy": 0.29141946602612734, | |
| "epoch": 0.0059, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7943304777145386, | |
| "kl": 0.6770852543413639, | |
| "learning_rate": 9.999875776944538e-05, | |
| "loss": -0.0049, | |
| "num_tokens": 16081715.0, | |
| "reward": 2.4234745502471924, | |
| "reward_std": 10.446115493774414, | |
| "rewards/rollout_reward_func/mean": 2.4234743118286133, | |
| "rewards/rollout_reward_func/std": 11.454586029052734, | |
| "sampling/importance_sampling_ratio/max": 1.456477165222168, | |
| "sampling/importance_sampling_ratio/mean": 0.9954730272293091, | |
| "sampling/importance_sampling_ratio/min": 0.6373972296714783, | |
| "sampling/sampling_logp_difference/max": 0.3991684913635254, | |
| "sampling/sampling_logp_difference/mean": 0.010583357885479927, | |
| "step": 295, | |
| "step_time": 37.65815602800012 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.049019608180969954, | |
| "clip_ratio/high_mean": 0.022671569080557674, | |
| "clip_ratio/low_mean": 0.029513889458030462, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05218545877141878, | |
| "entropy": 0.2855409812182188, | |
| "epoch": 0.00592, | |
| "grad_norm": 0.3455994129180908, | |
| "kl": 0.6785521320998669, | |
| "learning_rate": 9.999874815846655e-05, | |
| "loss": -0.0152, | |
| "step": 296, | |
| "step_time": 8.978216180002164 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010620915098115802, | |
| "clip_ratio/high_mean": 0.003523284336552024, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004391339898575097, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1353.0, | |
| "completions/max_terminated_length": 1353.0, | |
| "completions/mean_length": 1205.796875, | |
| "completions/mean_terminated_length": 1205.796875, | |
| "completions/min_length": 210.0, | |
| "completions/min_terminated_length": 210.0, | |
| "entropy": 0.3199264472350478, | |
| "epoch": 0.00594, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5850762128829956, | |
| "kl": 0.8113113101571798, | |
| "learning_rate": 9.999873851045201e-05, | |
| "loss": -0.0035, | |
| "num_tokens": 16210207.0, | |
| "reward": 2.8288328647613525, | |
| "reward_std": 14.778526306152344, | |
| "rewards/rollout_reward_func/mean": 2.8288326263427734, | |
| "rewards/rollout_reward_func/std": 16.02610969543457, | |
| "sampling/importance_sampling_ratio/max": 1.4778478145599365, | |
| "sampling/importance_sampling_ratio/mean": 1.0175740718841553, | |
| "sampling/importance_sampling_ratio/min": 0.6004241108894348, | |
| "sampling/sampling_logp_difference/max": 0.35140562057495117, | |
| "sampling/sampling_logp_difference/mean": 0.012288028374314308, | |
| "step": 297, | |
| "step_time": 36.8206370079979 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0490196084138006, | |
| "clip_ratio/high_mean": 0.015590063121635467, | |
| "clip_ratio/low_mean": 0.026909722771961242, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04249978606821969, | |
| "entropy": 0.3119704835116863, | |
| "epoch": 0.00596, | |
| "grad_norm": 0.6210339069366455, | |
| "kl": 0.8435764815658331, | |
| "learning_rate": 9.99987288254018e-05, | |
| "loss": -0.0125, | |
| "step": 298, | |
| "step_time": 9.949690105999252 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1320.0, | |
| "completions/max_terminated_length": 1320.0, | |
| "completions/mean_length": 1166.171875, | |
| "completions/mean_terminated_length": 1166.171875, | |
| "completions/min_length": 865.0, | |
| "completions/min_terminated_length": 865.0, | |
| "entropy": 0.3107016496360302, | |
| "epoch": 0.00598, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7741795778274536, | |
| "kl": 0.8151105176657438, | |
| "learning_rate": 9.99987191033159e-05, | |
| "loss": 0.013, | |
| "num_tokens": 16335952.0, | |
| "reward": 0.605268120765686, | |
| "reward_std": 9.44769287109375, | |
| "rewards/rollout_reward_func/mean": 0.6052679419517517, | |
| "rewards/rollout_reward_func/std": 10.618112564086914, | |
| "sampling/importance_sampling_ratio/max": 1.4987179040908813, | |
| "sampling/importance_sampling_ratio/mean": 1.0136826038360596, | |
| "sampling/importance_sampling_ratio/min": 0.7334418892860413, | |
| "sampling/sampling_logp_difference/max": 0.23920416831970215, | |
| "sampling/sampling_logp_difference/mean": 0.011820180341601372, | |
| "step": 299, | |
| "step_time": 35.96857580300002 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07089971494860947, | |
| "clip_ratio/high_mean": 0.022029462968930602, | |
| "clip_ratio/low_mean": 0.03730450588045642, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05933396948967129, | |
| "entropy": 0.28032723255455494, | |
| "epoch": 0.006, | |
| "grad_norm": 0.37243181467056274, | |
| "kl": 0.8571038488298655, | |
| "learning_rate": 9.999870934419433e-05, | |
| "loss": -0.0014, | |
| "step": 300, | |
| "step_time": 9.851978641999267 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.014093137346208096, | |
| "clip_ratio/high_mean": 0.004391339898575097, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005259395460598171, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1347.0, | |
| "completions/max_terminated_length": 1347.0, | |
| "completions/mean_length": 1216.4375, | |
| "completions/mean_terminated_length": 1216.4375, | |
| "completions/min_length": 825.0, | |
| "completions/min_terminated_length": 825.0, | |
| "entropy": 0.2521855002269149, | |
| "epoch": 0.00602, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6381392478942871, | |
| "kl": 0.8058282844722271, | |
| "learning_rate": 9.999869954803708e-05, | |
| "loss": 0.0246, | |
| "num_tokens": 16465057.0, | |
| "reward": 4.353306293487549, | |
| "reward_std": 11.903841018676758, | |
| "rewards/rollout_reward_func/mean": 4.353306293487549, | |
| "rewards/rollout_reward_func/std": 13.071228981018066, | |
| "sampling/importance_sampling_ratio/max": 1.870285153388977, | |
| "sampling/importance_sampling_ratio/mean": 1.014232873916626, | |
| "sampling/importance_sampling_ratio/min": 0.6221296191215515, | |
| "sampling/sampling_logp_difference/max": 0.5893880128860474, | |
| "sampling/sampling_logp_difference/mean": 0.010516786947846413, | |
| "step": 301, | |
| "step_time": 37.21239350799988 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0890522887930274, | |
| "clip_ratio/high_mean": 0.02920751681085676, | |
| "clip_ratio/low_mean": 0.026308735250495374, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05551625177031383, | |
| "entropy": 0.26322738360613585, | |
| "epoch": 0.00604, | |
| "grad_norm": 0.3497966527938843, | |
| "kl": 0.8271188456565142, | |
| "learning_rate": 9.999868971484418e-05, | |
| "loss": 0.0178, | |
| "step": 302, | |
| "step_time": 9.60695320800096 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.007504480192437768, | |
| "clip_ratio/high_mean": 0.0027441755519248545, | |
| "clip_ratio/low_mean": 0.004391339898575097, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007135515450499952, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1357.0, | |
| "completions/max_terminated_length": 1357.0, | |
| "completions/mean_length": 1233.578125, | |
| "completions/mean_terminated_length": 1232.1270751953125, | |
| "completions/min_length": 963.0, | |
| "completions/min_terminated_length": 963.0, | |
| "entropy": 0.3282460719347, | |
| "epoch": 0.00606, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8028053641319275, | |
| "kl": 1.0723739713430405, | |
| "learning_rate": 9.999867984461563e-05, | |
| "loss": -0.009, | |
| "num_tokens": 16595305.0, | |
| "reward": 4.948282718658447, | |
| "reward_std": 13.311994552612305, | |
| "rewards/rollout_reward_func/mean": 4.9482831954956055, | |
| "rewards/rollout_reward_func/std": 13.96406078338623, | |
| "sampling/importance_sampling_ratio/max": 1.4959264993667603, | |
| "sampling/importance_sampling_ratio/mean": 1.0020052194595337, | |
| "sampling/importance_sampling_ratio/min": 0.664537250995636, | |
| "sampling/sampling_logp_difference/max": 0.4060518741607666, | |
| "sampling/sampling_logp_difference/mean": 0.01389513909816742, | |
| "step": 303, | |
| "step_time": 37.319652419000704 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.10214776475913823, | |
| "clip_ratio/high_mean": 0.0419843090348877, | |
| "clip_ratio/low_mean": 0.032362769707106054, | |
| "clip_ratio/low_min": 0.0029761905316263437, | |
| "clip_ratio/region_mean": 0.07434707973152399, | |
| "entropy": 0.35452230647206306, | |
| "epoch": 0.00608, | |
| "grad_norm": 0.5315479040145874, | |
| "kl": 0.8896235972642899, | |
| "learning_rate": 9.999866993735147e-05, | |
| "loss": -0.0191, | |
| "step": 304, | |
| "step_time": 9.31032760600101 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666744276881, | |
| "clip_ratio/high_mean": 0.0026041666860692203, | |
| "clip_ratio/low_mean": 0.0013720877468585968, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003976254432927817, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1349.0, | |
| "completions/max_terminated_length": 1349.0, | |
| "completions/mean_length": 1178.75, | |
| "completions/mean_terminated_length": 1178.75, | |
| "completions/min_length": 276.0, | |
| "completions/min_terminated_length": 276.0, | |
| "entropy": 0.3360240999609232, | |
| "epoch": 0.0061, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8160866498947144, | |
| "kl": 0.7584020271897316, | |
| "learning_rate": 9.999865999305169e-05, | |
| "loss": 0.0343, | |
| "num_tokens": 16721992.0, | |
| "reward": 5.459122657775879, | |
| "reward_std": 12.891645431518555, | |
| "rewards/rollout_reward_func/mean": 5.459122657775879, | |
| "rewards/rollout_reward_func/std": 13.743046760559082, | |
| "sampling/importance_sampling_ratio/max": 1.5625214576721191, | |
| "sampling/importance_sampling_ratio/mean": 0.9862264394760132, | |
| "sampling/importance_sampling_ratio/min": 0.7355522513389587, | |
| "sampling/sampling_logp_difference/max": 0.3090386390686035, | |
| "sampling/sampling_logp_difference/mean": 0.012195384129881859, | |
| "step": 305, | |
| "step_time": 36.28136819499923 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.10116884484887123, | |
| "clip_ratio/high_mean": 0.03489725984400138, | |
| "clip_ratio/low_mean": 0.055271854158490896, | |
| "clip_ratio/low_min": 0.0069444444961845875, | |
| "clip_ratio/region_mean": 0.09016911429353058, | |
| "entropy": 0.3460291214287281, | |
| "epoch": 0.00612, | |
| "grad_norm": 0.4136711359024048, | |
| "kl": 0.7873252909630537, | |
| "learning_rate": 9.999865001171627e-05, | |
| "loss": 0.0177, | |
| "step": 306, | |
| "step_time": 10.277970246998848 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017422385746613145, | |
| "clip_ratio/high_mean": 0.004355596436653286, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00522365199867636, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1335.0, | |
| "completions/max_terminated_length": 1335.0, | |
| "completions/mean_length": 1148.03125, | |
| "completions/mean_terminated_length": 1148.03125, | |
| "completions/min_length": 197.0, | |
| "completions/min_terminated_length": 197.0, | |
| "entropy": 0.4305746052414179, | |
| "epoch": 0.00614, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9119190573692322, | |
| "kl": 0.862309418618679, | |
| "learning_rate": 9.999863999334527e-05, | |
| "loss": -0.039, | |
| "num_tokens": 16846641.0, | |
| "reward": 4.154011249542236, | |
| "reward_std": 13.017316818237305, | |
| "rewards/rollout_reward_func/mean": 4.1540117263793945, | |
| "rewards/rollout_reward_func/std": 12.931968688964844, | |
| "sampling/importance_sampling_ratio/max": 1.4475128650665283, | |
| "sampling/importance_sampling_ratio/mean": 0.9774882793426514, | |
| "sampling/importance_sampling_ratio/min": 9.214395739476355e-13, | |
| "sampling/sampling_logp_difference/max": 23.965322494506836, | |
| "sampling/sampling_logp_difference/mean": 0.03728090599179268, | |
| "step": 307, | |
| "step_time": 33.57406117500068 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08014640025794506, | |
| "clip_ratio/high_mean": 0.026986419688910246, | |
| "clip_ratio/low_mean": 0.03807902126573026, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.06506544025614858, | |
| "entropy": 0.4615292586386204, | |
| "epoch": 0.00616, | |
| "grad_norm": 0.544769287109375, | |
| "kl": 0.8701771721243858, | |
| "learning_rate": 9.999862993793865e-05, | |
| "loss": -0.0498, | |
| "step": 308, | |
| "step_time": 9.750469571001304 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0021551724057644606, | |
| "clip_ratio/high_mean": 0.0005387931014411151, | |
| "clip_ratio/low_mean": 0.004073183808941394, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004611976910382509, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1337.0, | |
| "completions/max_terminated_length": 1337.0, | |
| "completions/mean_length": 1176.34375, | |
| "completions/mean_terminated_length": 1176.34375, | |
| "completions/min_length": 862.0, | |
| "completions/min_terminated_length": 862.0, | |
| "entropy": 0.4520879667252302, | |
| "epoch": 0.00618, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6592026352882385, | |
| "kl": 1.228204183280468, | |
| "learning_rate": 9.999861984549645e-05, | |
| "loss": 0.0146, | |
| "num_tokens": 16973130.0, | |
| "reward": 5.186724662780762, | |
| "reward_std": 12.892146110534668, | |
| "rewards/rollout_reward_func/mean": 5.186724662780762, | |
| "rewards/rollout_reward_func/std": 12.396245002746582, | |
| "sampling/importance_sampling_ratio/max": 1.4907543659210205, | |
| "sampling/importance_sampling_ratio/mean": 0.992376446723938, | |
| "sampling/importance_sampling_ratio/min": 0.6941927671432495, | |
| "sampling/sampling_logp_difference/max": 0.338625431060791, | |
| "sampling/sampling_logp_difference/mean": 0.014916637912392616, | |
| "step": 309, | |
| "step_time": 35.957562657998096 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07991837477311492, | |
| "clip_ratio/high_mean": 0.02518792706541717, | |
| "clip_ratio/low_mean": 0.04024840978672728, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06543633691035211, | |
| "entropy": 0.44149017706513405, | |
| "epoch": 0.0062, | |
| "grad_norm": 0.5282915234565735, | |
| "kl": 1.2467477656900883, | |
| "learning_rate": 9.999860971601868e-05, | |
| "loss": -0.002, | |
| "step": 310, | |
| "step_time": 8.980034561999673 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0008223684271797538, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016904239892028272, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1348.0, | |
| "completions/max_terminated_length": 1348.0, | |
| "completions/mean_length": 1184.96875, | |
| "completions/mean_terminated_length": 1184.96875, | |
| "completions/min_length": 917.0, | |
| "completions/min_terminated_length": 917.0, | |
| "entropy": 0.5129956435412169, | |
| "epoch": 0.00622, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7465701699256897, | |
| "kl": 1.0012187995016575, | |
| "learning_rate": 9.999859954950535e-05, | |
| "loss": 0.018, | |
| "num_tokens": 17100245.0, | |
| "reward": 4.89565372467041, | |
| "reward_std": 13.874456405639648, | |
| "rewards/rollout_reward_func/mean": 4.89565372467041, | |
| "rewards/rollout_reward_func/std": 14.702526092529297, | |
| "sampling/importance_sampling_ratio/max": 1.5052223205566406, | |
| "sampling/importance_sampling_ratio/mean": 1.027785062789917, | |
| "sampling/importance_sampling_ratio/min": 0.5468899607658386, | |
| "sampling/sampling_logp_difference/max": 0.4049875736236572, | |
| "sampling/sampling_logp_difference/mean": 0.017239127308130264, | |
| "step": 311, | |
| "step_time": 34.84647880300054 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08282635360956192, | |
| "clip_ratio/high_mean": 0.031164190906565636, | |
| "clip_ratio/low_mean": 0.05087516509229317, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.08203935588244349, | |
| "entropy": 0.5392583776265383, | |
| "epoch": 0.00624, | |
| "grad_norm": 0.5834032297134399, | |
| "kl": 1.0580051615834236, | |
| "learning_rate": 9.999858934595648e-05, | |
| "loss": 0.0006, | |
| "step": 312, | |
| "step_time": 9.797768173001714 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.006076388992369175, | |
| "clip_ratio/high_mean": 0.0015190972480922937, | |
| "clip_ratio/low_mean": 0.0006793478387407959, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0021984450868330896, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1312.0, | |
| "completions/max_terminated_length": 1312.0, | |
| "completions/mean_length": 1134.828125, | |
| "completions/mean_terminated_length": 1133.3968505859375, | |
| "completions/min_length": 492.0, | |
| "completions/min_terminated_length": 492.0, | |
| "entropy": 0.5534908715635538, | |
| "epoch": 0.00626, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7070833444595337, | |
| "kl": 0.9324860982596874, | |
| "learning_rate": 9.999857910537204e-05, | |
| "loss": 0.0171, | |
| "num_tokens": 17224021.0, | |
| "reward": 2.2121267318725586, | |
| "reward_std": 12.934229850769043, | |
| "rewards/rollout_reward_func/mean": 2.2121264934539795, | |
| "rewards/rollout_reward_func/std": 13.348692893981934, | |
| "sampling/importance_sampling_ratio/max": 1.3357430696487427, | |
| "sampling/importance_sampling_ratio/mean": 0.9801706075668335, | |
| "sampling/importance_sampling_ratio/min": 0.6364750862121582, | |
| "sampling/sampling_logp_difference/max": 0.28098082542419434, | |
| "sampling/sampling_logp_difference/mean": 0.01595621556043625, | |
| "step": 313, | |
| "step_time": 34.49526453500039 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.09578519035130739, | |
| "clip_ratio/high_mean": 0.028240888088475913, | |
| "clip_ratio/low_mean": 0.043463885551318526, | |
| "clip_ratio/low_min": 0.003289473708719015, | |
| "clip_ratio/region_mean": 0.07170477387262508, | |
| "entropy": 0.5145694836974144, | |
| "epoch": 0.00628, | |
| "grad_norm": 8.991610527038574, | |
| "kl": 2.500880379229784, | |
| "learning_rate": 9.999856882775207e-05, | |
| "loss": 0.0362, | |
| "step": 314, | |
| "step_time": 9.686568430998705 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0008680555620230734, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1341.0, | |
| "completions/max_terminated_length": 1341.0, | |
| "completions/mean_length": 1152.8125, | |
| "completions/mean_terminated_length": 1152.8125, | |
| "completions/min_length": 203.0, | |
| "completions/min_terminated_length": 203.0, | |
| "entropy": 0.47399672865867615, | |
| "epoch": 0.0063, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9034998416900635, | |
| "kl": 0.8184376284480095, | |
| "learning_rate": 9.999855851309658e-05, | |
| "loss": 0.0293, | |
| "num_tokens": 17349042.0, | |
| "reward": 2.9872653484344482, | |
| "reward_std": 10.313895225524902, | |
| "rewards/rollout_reward_func/mean": 2.9872655868530273, | |
| "rewards/rollout_reward_func/std": 11.123116493225098, | |
| "sampling/importance_sampling_ratio/max": 1.5023268461227417, | |
| "sampling/importance_sampling_ratio/mean": 0.9912445545196533, | |
| "sampling/importance_sampling_ratio/min": 0.5293837189674377, | |
| "sampling/sampling_logp_difference/max": 0.49621057510375977, | |
| "sampling/sampling_logp_difference/mean": 0.01648723892867565, | |
| "step": 315, | |
| "step_time": 36.106211563002034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07373366155661643, | |
| "clip_ratio/high_mean": 0.02871302078710869, | |
| "clip_ratio/low_mean": 0.052897133806254715, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.08161015470977873, | |
| "entropy": 0.439556997269392, | |
| "epoch": 0.00632, | |
| "grad_norm": 1.1542975902557373, | |
| "kl": 0.8081017658114433, | |
| "learning_rate": 9.999854816140556e-05, | |
| "loss": 0.0112, | |
| "step": 316, | |
| "step_time": 9.529359940000177 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041666860692203, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1345.0, | |
| "completions/max_terminated_length": 1345.0, | |
| "completions/mean_length": 1153.015625, | |
| "completions/mean_terminated_length": 1152.2857666015625, | |
| "completions/min_length": 391.0, | |
| "completions/min_terminated_length": 391.0, | |
| "entropy": 0.4202824104577303, | |
| "epoch": 0.00634, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8387591242790222, | |
| "kl": 0.7999376337975264, | |
| "learning_rate": 9.999853777267906e-05, | |
| "loss": -0.0113, | |
| "num_tokens": 17474080.0, | |
| "reward": 3.8928003311157227, | |
| "reward_std": 13.945871353149414, | |
| "rewards/rollout_reward_func/mean": 3.8928003311157227, | |
| "rewards/rollout_reward_func/std": 14.018685340881348, | |
| "sampling/importance_sampling_ratio/max": 1.3972409963607788, | |
| "sampling/importance_sampling_ratio/mean": 0.9933174252510071, | |
| "sampling/importance_sampling_ratio/min": 0.66861891746521, | |
| "sampling/sampling_logp_difference/max": 0.3364081382751465, | |
| "sampling/sampling_logp_difference/mean": 0.013292517513036728, | |
| "step": 317, | |
| "step_time": 35.11344056699909 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.062046968610957265, | |
| "clip_ratio/high_mean": 0.02252296026563272, | |
| "clip_ratio/low_mean": 0.06663749110884964, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.08916045201476663, | |
| "entropy": 0.3698996100574732, | |
| "epoch": 0.00636, | |
| "grad_norm": 0.5773271918296814, | |
| "kl": 0.9309169836342335, | |
| "learning_rate": 9.999852734691706e-05, | |
| "loss": -0.0303, | |
| "step": 318, | |
| "step_time": 9.084739140999773 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0034635705524124205, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034635705524124205, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1360.0, | |
| "completions/max_terminated_length": 1360.0, | |
| "completions/mean_length": 1189.828125, | |
| "completions/mean_terminated_length": 1189.828125, | |
| "completions/min_length": 1056.0, | |
| "completions/min_terminated_length": 1056.0, | |
| "entropy": 0.3289623577147722, | |
| "epoch": 0.00638, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7116008400917053, | |
| "kl": 0.9500053711235523, | |
| "learning_rate": 9.999851688411959e-05, | |
| "loss": 0.0123, | |
| "num_tokens": 17601410.0, | |
| "reward": 4.444620609283447, | |
| "reward_std": 12.232638359069824, | |
| "rewards/rollout_reward_func/mean": 4.444620609283447, | |
| "rewards/rollout_reward_func/std": 12.037857055664062, | |
| "sampling/importance_sampling_ratio/max": 1.8450855016708374, | |
| "sampling/importance_sampling_ratio/mean": 0.9873309135437012, | |
| "sampling/importance_sampling_ratio/min": 2.6370022485067146e-11, | |
| "sampling/sampling_logp_difference/max": 11.255170822143555, | |
| "sampling/sampling_logp_difference/mean": 0.033629726618528366, | |
| "step": 319, | |
| "step_time": 38.23992628900032 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06827694294042885, | |
| "clip_ratio/high_mean": 0.024013680347707123, | |
| "clip_ratio/low_mean": 0.04197527136420831, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06598895112983882, | |
| "entropy": 0.3273693434894085, | |
| "epoch": 0.0064, | |
| "grad_norm": 0.5724111795425415, | |
| "kl": 1.0788306891918182, | |
| "learning_rate": 9.999850638428662e-05, | |
| "loss": 0.0049, | |
| "step": 320, | |
| "step_time": 10.348325264999403 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.009027777938172221, | |
| "clip_ratio/high_mean": 0.0022569444845430553, | |
| "clip_ratio/low_mean": 0.002170138934161514, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004427083418704569, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1336.0, | |
| "completions/max_terminated_length": 1336.0, | |
| "completions/mean_length": 1203.65625, | |
| "completions/mean_terminated_length": 1203.65625, | |
| "completions/min_length": 898.0, | |
| "completions/min_terminated_length": 898.0, | |
| "entropy": 0.3083435148000717, | |
| "epoch": 0.00642, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8611142039299011, | |
| "kl": 0.9252029061317444, | |
| "learning_rate": 9.99984958474182e-05, | |
| "loss": 0.024, | |
| "num_tokens": 17729704.0, | |
| "reward": 2.3030171394348145, | |
| "reward_std": 10.394119262695312, | |
| "rewards/rollout_reward_func/mean": 2.3030171394348145, | |
| "rewards/rollout_reward_func/std": 11.775047302246094, | |
| "sampling/importance_sampling_ratio/max": 1.6587164402008057, | |
| "sampling/importance_sampling_ratio/mean": 1.0117642879486084, | |
| "sampling/importance_sampling_ratio/min": 0.4190000295639038, | |
| "sampling/sampling_logp_difference/max": 0.3578883409500122, | |
| "sampling/sampling_logp_difference/mean": 0.015003521926701069, | |
| "step": 321, | |
| "step_time": 35.224505068000326 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07542938669212162, | |
| "clip_ratio/high_mean": 0.03128034179098904, | |
| "clip_ratio/low_mean": 0.03897239360958338, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0702527352841571, | |
| "entropy": 0.2796294568106532, | |
| "epoch": 0.00644, | |
| "grad_norm": 0.5799975395202637, | |
| "kl": 0.8807330075651407, | |
| "learning_rate": 9.999848527351433e-05, | |
| "loss": 0.0091, | |
| "step": 322, | |
| "step_time": 9.679342022999663 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034722222480922937, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1294.0, | |
| "completions/max_terminated_length": 1294.0, | |
| "completions/mean_length": 1194.96875, | |
| "completions/mean_terminated_length": 1194.96875, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "entropy": 0.24392448458820581, | |
| "epoch": 0.00646, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7959789633750916, | |
| "kl": 0.779301343485713, | |
| "learning_rate": 9.9998474662575e-05, | |
| "loss": -0.0199, | |
| "num_tokens": 17857450.0, | |
| "reward": 4.581869125366211, | |
| "reward_std": 11.262429237365723, | |
| "rewards/rollout_reward_func/mean": 4.581869602203369, | |
| "rewards/rollout_reward_func/std": 12.287596702575684, | |
| "sampling/importance_sampling_ratio/max": 2.240818977355957, | |
| "sampling/importance_sampling_ratio/mean": 1.018520712852478, | |
| "sampling/importance_sampling_ratio/min": 0.3999040722846985, | |
| "sampling/sampling_logp_difference/max": 0.5928263664245605, | |
| "sampling/sampling_logp_difference/mean": 0.012124484404921532, | |
| "step": 323, | |
| "step_time": 37.114893339001355 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0683479537256062, | |
| "clip_ratio/high_mean": 0.022346383950207382, | |
| "clip_ratio/low_mean": 0.03898888279218227, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06133526662597433, | |
| "entropy": 0.2431696206331253, | |
| "epoch": 0.00648, | |
| "grad_norm": 0.35803645849227905, | |
| "kl": 0.767679963260889, | |
| "learning_rate": 9.999846401460026e-05, | |
| "loss": -0.0339, | |
| "step": 324, | |
| "step_time": 8.890772448001371 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015318243764340878, | |
| "clip_ratio/high_mean": 0.0038295609410852194, | |
| "clip_ratio/low_mean": 0.0023561508278362453, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006185711768921465, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1359.0, | |
| "completions/max_terminated_length": 1359.0, | |
| "completions/mean_length": 1233.796875, | |
| "completions/mean_terminated_length": 1233.796875, | |
| "completions/min_length": 663.0, | |
| "completions/min_terminated_length": 663.0, | |
| "entropy": 0.24552472867071629, | |
| "epoch": 0.0065, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7995591163635254, | |
| "kl": 0.8211700264364481, | |
| "learning_rate": 9.99984533295901e-05, | |
| "loss": -0.0057, | |
| "num_tokens": 17987636.0, | |
| "reward": 2.361278533935547, | |
| "reward_std": 11.01347541809082, | |
| "rewards/rollout_reward_func/mean": 2.361278533935547, | |
| "rewards/rollout_reward_func/std": 11.316116333007812, | |
| "sampling/importance_sampling_ratio/max": 1.4373282194137573, | |
| "sampling/importance_sampling_ratio/mean": 0.9916459321975708, | |
| "sampling/importance_sampling_ratio/min": 0.7290171384811401, | |
| "sampling/sampling_logp_difference/max": 0.3705787658691406, | |
| "sampling/sampling_logp_difference/mean": 0.01046331413090229, | |
| "step": 325, | |
| "step_time": 38.97714790000191 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05977182672359049, | |
| "clip_ratio/high_mean": 0.02233774628257379, | |
| "clip_ratio/low_mean": 0.027810412109829485, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05014815804315731, | |
| "entropy": 0.24062953237444162, | |
| "epoch": 0.00652, | |
| "grad_norm": 0.6732361316680908, | |
| "kl": 0.9134266618639231, | |
| "learning_rate": 9.999844260754451e-05, | |
| "loss": -0.011, | |
| "step": 326, | |
| "step_time": 9.702275648000068 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034722222480922937, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1343.0, | |
| "completions/max_terminated_length": 1343.0, | |
| "completions/mean_length": 1266.625, | |
| "completions/mean_terminated_length": 1266.625, | |
| "completions/min_length": 1005.0, | |
| "completions/min_terminated_length": 1005.0, | |
| "entropy": 0.19475865550339222, | |
| "epoch": 0.00654, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7081814408302307, | |
| "kl": 0.7645703088492155, | |
| "learning_rate": 9.999843184846354e-05, | |
| "loss": 0.0194, | |
| "num_tokens": 18120014.0, | |
| "reward": 6.7441205978393555, | |
| "reward_std": 12.950173377990723, | |
| "rewards/rollout_reward_func/mean": 6.7441205978393555, | |
| "rewards/rollout_reward_func/std": 13.17819881439209, | |
| "sampling/importance_sampling_ratio/max": 2.733876943588257, | |
| "sampling/importance_sampling_ratio/mean": 1.017435908317566, | |
| "sampling/importance_sampling_ratio/min": 0.8077232837677002, | |
| "sampling/sampling_logp_difference/max": 1.0649070739746094, | |
| "sampling/sampling_logp_difference/mean": 0.008961044251918793, | |
| "step": 327, | |
| "step_time": 38.45977644300001 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05813231039792299, | |
| "clip_ratio/high_mean": 0.0188276685657911, | |
| "clip_ratio/low_mean": 0.017785656382329762, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03661332529736683, | |
| "entropy": 0.18874722812324762, | |
| "epoch": 0.00656, | |
| "grad_norm": 0.29629969596862793, | |
| "kl": 0.7521160487085581, | |
| "learning_rate": 9.999842105234716e-05, | |
| "loss": 0.0089, | |
| "step": 328, | |
| "step_time": 9.240072366999811 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01736111124046147, | |
| "clip_ratio/high_mean": 0.004340277810115367, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004340277810115367, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1335.0, | |
| "completions/max_terminated_length": 1335.0, | |
| "completions/mean_length": 1223.046875, | |
| "completions/mean_terminated_length": 1223.046875, | |
| "completions/min_length": 1069.0, | |
| "completions/min_terminated_length": 1069.0, | |
| "entropy": 0.17775962874293327, | |
| "epoch": 0.00658, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5002116560935974, | |
| "kl": 0.5365802068263292, | |
| "learning_rate": 9.999841021919543e-05, | |
| "loss": -0.0003, | |
| "num_tokens": 18249422.0, | |
| "reward": 5.926024436950684, | |
| "reward_std": 10.913434028625488, | |
| "rewards/rollout_reward_func/mean": 5.926024436950684, | |
| "rewards/rollout_reward_func/std": 11.495051383972168, | |
| "sampling/importance_sampling_ratio/max": 1.340820550918579, | |
| "sampling/importance_sampling_ratio/mean": 0.9783110618591309, | |
| "sampling/importance_sampling_ratio/min": 0.5937914848327637, | |
| "sampling/sampling_logp_difference/max": 0.4624512195587158, | |
| "sampling/sampling_logp_difference/mean": 0.009367045015096664, | |
| "step": 329, | |
| "step_time": 40.06546166299813 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.049223856534808874, | |
| "clip_ratio/high_mean": 0.015778186498209834, | |
| "clip_ratio/low_mean": 0.02711397095117718, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04289215768221766, | |
| "entropy": 0.16470052115619183, | |
| "epoch": 0.0066, | |
| "grad_norm": 0.272549033164978, | |
| "kl": 0.5706925727427006, | |
| "learning_rate": 9.999839934900832e-05, | |
| "loss": -0.0098, | |
| "step": 330, | |
| "step_time": 9.584335595999619 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.006761695956811309, | |
| "clip_ratio/high_mean": 0.0016904239892028272, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016904239892028272, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1346.0, | |
| "completions/max_terminated_length": 1346.0, | |
| "completions/mean_length": 1230.453125, | |
| "completions/mean_terminated_length": 1230.453125, | |
| "completions/min_length": 195.0, | |
| "completions/min_terminated_length": 195.0, | |
| "entropy": 0.15301176952198148, | |
| "epoch": 0.00662, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5951566696166992, | |
| "kl": 0.6225019320845604, | |
| "learning_rate": 9.999838844178584e-05, | |
| "loss": -0.0415, | |
| "num_tokens": 18379457.0, | |
| "reward": 5.441021919250488, | |
| "reward_std": 11.596078872680664, | |
| "rewards/rollout_reward_func/mean": 5.441021919250488, | |
| "rewards/rollout_reward_func/std": 13.130385398864746, | |
| "sampling/importance_sampling_ratio/max": 1.2981815338134766, | |
| "sampling/importance_sampling_ratio/mean": 0.9712120294570923, | |
| "sampling/importance_sampling_ratio/min": 0.5313878655433655, | |
| "sampling/sampling_logp_difference/max": 0.4391303062438965, | |
| "sampling/sampling_logp_difference/mean": 0.008532309904694557, | |
| "step": 331, | |
| "step_time": 38.03544032799982 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.040491855004802346, | |
| "clip_ratio/high_mean": 0.015382359270006418, | |
| "clip_ratio/low_mean": 0.02635878958972171, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04174114967463538, | |
| "entropy": 0.13200736604630947, | |
| "epoch": 0.00664, | |
| "grad_norm": 0.4791216552257538, | |
| "kl": 0.6594886407256126, | |
| "learning_rate": 9.999837749752803e-05, | |
| "loss": -0.0494, | |
| "step": 332, | |
| "step_time": 9.623436052000216 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666744276881, | |
| "clip_ratio/high_mean": 0.0034722222480922937, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004340277810115367, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1344.0, | |
| "completions/max_terminated_length": 1344.0, | |
| "completions/mean_length": 1219.734375, | |
| "completions/mean_terminated_length": 1219.734375, | |
| "completions/min_length": 196.0, | |
| "completions/min_terminated_length": 196.0, | |
| "entropy": 0.11573670757934451, | |
| "epoch": 0.00666, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6563036441802979, | |
| "kl": 0.851641334593296, | |
| "learning_rate": 9.999836651623487e-05, | |
| "loss": -0.0048, | |
| "num_tokens": 18508741.0, | |
| "reward": 5.738734245300293, | |
| "reward_std": 12.408971786499023, | |
| "rewards/rollout_reward_func/mean": 5.738734245300293, | |
| "rewards/rollout_reward_func/std": 12.671599388122559, | |
| "sampling/importance_sampling_ratio/max": 1.3496527671813965, | |
| "sampling/importance_sampling_ratio/mean": 1.0077811479568481, | |
| "sampling/importance_sampling_ratio/min": 0.6974970102310181, | |
| "sampling/sampling_logp_difference/max": 0.3328993320465088, | |
| "sampling/sampling_logp_difference/mean": 0.00713011808693409, | |
| "step": 333, | |
| "step_time": 39.30893147200186 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03472222248092294, | |
| "clip_ratio/high_mean": 0.011284722364507616, | |
| "clip_ratio/low_mean": 0.024913194763939828, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03619791695382446, | |
| "entropy": 0.11344034224748611, | |
| "epoch": 0.00668, | |
| "grad_norm": 0.4923565983772278, | |
| "kl": 0.6980615984648466, | |
| "learning_rate": 9.999835549790641e-05, | |
| "loss": -0.0079, | |
| "step": 334, | |
| "step_time": 10.117216201999327 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0036764706019312143, | |
| "clip_ratio/high_mean": 0.0009191176504828036, | |
| "clip_ratio/low_mean": 0.0034722222480922937, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004391339898575097, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1359.0, | |
| "completions/max_terminated_length": 1359.0, | |
| "completions/mean_length": 1238.984375, | |
| "completions/mean_terminated_length": 1238.984375, | |
| "completions/min_length": 635.0, | |
| "completions/min_terminated_length": 635.0, | |
| "entropy": 0.12856985442340374, | |
| "epoch": 0.0067, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8339588642120361, | |
| "kl": 0.6355916745960712, | |
| "learning_rate": 9.999834444254262e-05, | |
| "loss": -0.0042, | |
| "num_tokens": 18639311.0, | |
| "reward": 6.144355297088623, | |
| "reward_std": 13.870889663696289, | |
| "rewards/rollout_reward_func/mean": 6.144355297088623, | |
| "rewards/rollout_reward_func/std": 14.220029830932617, | |
| "sampling/importance_sampling_ratio/max": 1.3300856351852417, | |
| "sampling/importance_sampling_ratio/mean": 0.9924861788749695, | |
| "sampling/importance_sampling_ratio/min": 0.6479190587997437, | |
| "sampling/sampling_logp_difference/max": 0.2639361619949341, | |
| "sampling/sampling_logp_difference/mean": 0.006817285902798176, | |
| "step": 335, | |
| "step_time": 37.15376971599926 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03817401989363134, | |
| "clip_ratio/high_mean": 0.012147671717684716, | |
| "clip_ratio/low_mean": 0.020067402394488454, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03221507422858849, | |
| "entropy": 0.12425063038244843, | |
| "epoch": 0.00672, | |
| "grad_norm": 0.32718658447265625, | |
| "kl": 0.7219895403832197, | |
| "learning_rate": 9.999833335014352e-05, | |
| "loss": -0.011, | |
| "step": 336, | |
| "step_time": 9.780628326000624 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041666860692203, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1356.0, | |
| "completions/max_terminated_length": 1356.0, | |
| "completions/mean_length": 1266.78125, | |
| "completions/mean_terminated_length": 1266.78125, | |
| "completions/min_length": 794.0, | |
| "completions/min_terminated_length": 794.0, | |
| "entropy": 0.11600295826792717, | |
| "epoch": 0.00674, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7365610003471375, | |
| "kl": 0.5392901804298162, | |
| "learning_rate": 9.999832222070914e-05, | |
| "loss": 0.0023, | |
| "num_tokens": 18771742.0, | |
| "reward": 5.880302429199219, | |
| "reward_std": 12.320051193237305, | |
| "rewards/rollout_reward_func/mean": 5.880302429199219, | |
| "rewards/rollout_reward_func/std": 12.716879844665527, | |
| "sampling/importance_sampling_ratio/max": 1.3457348346710205, | |
| "sampling/importance_sampling_ratio/mean": 0.9991644620895386, | |
| "sampling/importance_sampling_ratio/min": 0.6999140381813049, | |
| "sampling/sampling_logp_difference/max": 0.3562436103820801, | |
| "sampling/sampling_logp_difference/mean": 0.005911126732826233, | |
| "step": 337, | |
| "step_time": 38.84034024799803 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03513071918860078, | |
| "clip_ratio/high_mean": 0.011386846599634737, | |
| "clip_ratio/low_mean": 0.024994894862174988, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.036381741403602064, | |
| "entropy": 0.1102461889386177, | |
| "epoch": 0.00676, | |
| "grad_norm": 0.2775817811489105, | |
| "kl": 0.6620934028178453, | |
| "learning_rate": 9.999831105423947e-05, | |
| "loss": -0.006, | |
| "step": 338, | |
| "step_time": 9.023721004000436 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1350.0, | |
| "completions/max_terminated_length": 1350.0, | |
| "completions/mean_length": 1238.953125, | |
| "completions/mean_terminated_length": 1238.953125, | |
| "completions/min_length": 1062.0, | |
| "completions/min_terminated_length": 1062.0, | |
| "entropy": 0.11055759433656931, | |
| "epoch": 0.00678, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5181728601455688, | |
| "kl": 0.462925398722291, | |
| "learning_rate": 9.999829985073453e-05, | |
| "loss": 0.0105, | |
| "num_tokens": 18902239.0, | |
| "reward": 7.53302526473999, | |
| "reward_std": 12.4171142578125, | |
| "rewards/rollout_reward_func/mean": 7.533025741577148, | |
| "rewards/rollout_reward_func/std": 13.036537170410156, | |
| "sampling/importance_sampling_ratio/max": 1.3853559494018555, | |
| "sampling/importance_sampling_ratio/mean": 1.000986933708191, | |
| "sampling/importance_sampling_ratio/min": 0.702711284160614, | |
| "sampling/sampling_logp_difference/max": 0.4794572591781616, | |
| "sampling/sampling_logp_difference/mean": 0.00590522913262248, | |
| "step": 339, | |
| "step_time": 39.08791527499943 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.039215686498209834, | |
| "clip_ratio/high_mean": 0.013276143989060074, | |
| "clip_ratio/low_mean": 0.02185995056061074, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03513609484070912, | |
| "entropy": 0.11709691304713488, | |
| "epoch": 0.0068, | |
| "grad_norm": 0.30310577154159546, | |
| "kl": 0.5310502368956804, | |
| "learning_rate": 9.999828861019435e-05, | |
| "loss": 0.006, | |
| "step": 340, | |
| "step_time": 9.792470953999327 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013888888992369175, | |
| "clip_ratio/high_mean": 0.0034722222480922937, | |
| "clip_ratio/low_mean": 0.0009191176504828036, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004391339898575097, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1347.0, | |
| "completions/max_terminated_length": 1347.0, | |
| "completions/mean_length": 1241.765625, | |
| "completions/mean_terminated_length": 1240.635009765625, | |
| "completions/min_length": 1101.0, | |
| "completions/min_terminated_length": 1101.0, | |
| "entropy": 0.12758585345000029, | |
| "epoch": 0.00682, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6102613210678101, | |
| "kl": 0.6113391723483801, | |
| "learning_rate": 9.99982773326189e-05, | |
| "loss": 0.0158, | |
| "num_tokens": 19032925.0, | |
| "reward": 3.9826180934906006, | |
| "reward_std": 12.427906036376953, | |
| "rewards/rollout_reward_func/mean": 3.9826183319091797, | |
| "rewards/rollout_reward_func/std": 13.354879379272461, | |
| "sampling/importance_sampling_ratio/max": 1.1800951957702637, | |
| "sampling/importance_sampling_ratio/mean": 0.997043251991272, | |
| "sampling/importance_sampling_ratio/min": 0.7389504313468933, | |
| "sampling/sampling_logp_difference/max": 0.2936210632324219, | |
| "sampling/sampling_logp_difference/mean": 0.005317248869687319, | |
| "step": 341, | |
| "step_time": 39.12008871799935 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021037581842392683, | |
| "clip_ratio/high_mean": 0.006995506584644318, | |
| "clip_ratio/low_mean": 0.016595179855357856, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023590686498209834, | |
| "entropy": 0.12726877955719829, | |
| "epoch": 0.00684, | |
| "grad_norm": 0.49857455492019653, | |
| "kl": 0.6323374789208174, | |
| "learning_rate": 9.999826601800824e-05, | |
| "loss": 0.0106, | |
| "step": 342, | |
| "step_time": 9.27907708300063 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0008680555620230734, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1341.0, | |
| "completions/max_terminated_length": 1341.0, | |
| "completions/mean_length": 1219.203125, | |
| "completions/mean_terminated_length": 1219.203125, | |
| "completions/min_length": 735.0, | |
| "completions/min_terminated_length": 735.0, | |
| "entropy": 0.11009268835186958, | |
| "epoch": 0.00686, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6059837341308594, | |
| "kl": 0.7338532544672489, | |
| "learning_rate": 9.999825466636233e-05, | |
| "loss": -0.0167, | |
| "num_tokens": 19162127.0, | |
| "reward": 4.720416069030762, | |
| "reward_std": 10.753931999206543, | |
| "rewards/rollout_reward_func/mean": 4.720416069030762, | |
| "rewards/rollout_reward_func/std": 12.976871490478516, | |
| "sampling/importance_sampling_ratio/max": 1.5183767080307007, | |
| "sampling/importance_sampling_ratio/mean": 1.0038487911224365, | |
| "sampling/importance_sampling_ratio/min": 0.6935895681381226, | |
| "sampling/sampling_logp_difference/max": 0.4249706268310547, | |
| "sampling/sampling_logp_difference/mean": 0.004785279743373394, | |
| "step": 343, | |
| "step_time": 39.04033868700208 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02798202633857727, | |
| "clip_ratio/high_mean": 0.012203840189613402, | |
| "clip_ratio/low_mean": 0.013071895577013493, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025275735824834555, | |
| "entropy": 0.11364737106487155, | |
| "epoch": 0.00688, | |
| "grad_norm": 0.2909540832042694, | |
| "kl": 0.7444342169910669, | |
| "learning_rate": 9.999824327768122e-05, | |
| "loss": -0.0205, | |
| "step": 344, | |
| "step_time": 9.620514073000777 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1359.0, | |
| "completions/max_terminated_length": 1359.0, | |
| "completions/mean_length": 1240.3125, | |
| "completions/mean_terminated_length": 1240.3125, | |
| "completions/min_length": 1088.0, | |
| "completions/min_terminated_length": 1088.0, | |
| "entropy": 0.13259067060425878, | |
| "epoch": 0.0069, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8596341609954834, | |
| "kl": 0.7895576078444719, | |
| "learning_rate": 9.99982318519649e-05, | |
| "loss": -0.0057, | |
| "num_tokens": 19292776.0, | |
| "reward": 2.67919659614563, | |
| "reward_std": 14.777613639831543, | |
| "rewards/rollout_reward_func/mean": 2.679196357727051, | |
| "rewards/rollout_reward_func/std": 15.276268005371094, | |
| "sampling/importance_sampling_ratio/max": 1.4407294988632202, | |
| "sampling/importance_sampling_ratio/mean": 0.9704160690307617, | |
| "sampling/importance_sampling_ratio/min": 0.6675639152526855, | |
| "sampling/sampling_logp_difference/max": 0.4319186210632324, | |
| "sampling/sampling_logp_difference/mean": 0.007419218309223652, | |
| "step": 345, | |
| "step_time": 38.83812410200153 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03472222248092294, | |
| "clip_ratio/high_mean": 0.009548611182253808, | |
| "clip_ratio/low_mean": 0.03599877539090812, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.045547386282123625, | |
| "entropy": 0.1278433846309781, | |
| "epoch": 0.00692, | |
| "grad_norm": 0.6303772330284119, | |
| "kl": 1.1542848944664001, | |
| "learning_rate": 9.999822038921338e-05, | |
| "loss": -0.0049, | |
| "step": 346, | |
| "step_time": 9.405012558002454 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0025584796094335616, | |
| "clip_ratio/low_mean": 0.0026041666860692203, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005162646295502782, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1353.0, | |
| "completions/max_terminated_length": 1353.0, | |
| "completions/mean_length": 1241.171875, | |
| "completions/mean_terminated_length": 1241.171875, | |
| "completions/min_length": 719.0, | |
| "completions/min_terminated_length": 719.0, | |
| "entropy": 0.1106796741951257, | |
| "epoch": 0.00694, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4797411561012268, | |
| "kl": 0.6278085261583328, | |
| "learning_rate": 9.99982088894267e-05, | |
| "loss": 0.0106, | |
| "num_tokens": 19423491.0, | |
| "reward": 5.54637336730957, | |
| "reward_std": 12.041938781738281, | |
| "rewards/rollout_reward_func/mean": 5.54637336730957, | |
| "rewards/rollout_reward_func/std": 13.066041946411133, | |
| "sampling/importance_sampling_ratio/max": 1.482460618019104, | |
| "sampling/importance_sampling_ratio/mean": 0.9943655133247375, | |
| "sampling/importance_sampling_ratio/min": 0.6257169246673584, | |
| "sampling/sampling_logp_difference/max": 0.510839581489563, | |
| "sampling/sampling_logp_difference/mean": 0.006749512627720833, | |
| "step": 347, | |
| "step_time": 39.46550670700071 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05993883335031569, | |
| "clip_ratio/high_mean": 0.015852763841394335, | |
| "clip_ratio/low_mean": 0.02315665892092511, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0390094225294888, | |
| "entropy": 0.11026378069072962, | |
| "epoch": 0.00696, | |
| "grad_norm": 0.3181508183479309, | |
| "kl": 0.6370288580656052, | |
| "learning_rate": 9.999819735260483e-05, | |
| "loss": 0.0068, | |
| "step": 348, | |
| "step_time": 10.100482684999406 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.002517361135687679, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002517361135687679, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1332.0, | |
| "completions/max_terminated_length": 1332.0, | |
| "completions/mean_length": 1208.171875, | |
| "completions/mean_terminated_length": 1208.171875, | |
| "completions/min_length": 189.0, | |
| "completions/min_terminated_length": 189.0, | |
| "entropy": 0.13927970174700022, | |
| "epoch": 0.00698, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5229349732398987, | |
| "kl": 0.5507344976067543, | |
| "learning_rate": 9.999818577874781e-05, | |
| "loss": 0.0234, | |
| "num_tokens": 19552011.0, | |
| "reward": 5.104192733764648, | |
| "reward_std": 11.615788459777832, | |
| "rewards/rollout_reward_func/mean": 5.104192733764648, | |
| "rewards/rollout_reward_func/std": 12.1382474899292, | |
| "sampling/importance_sampling_ratio/max": 1.415814995765686, | |
| "sampling/importance_sampling_ratio/mean": 1.0048539638519287, | |
| "sampling/importance_sampling_ratio/min": 1.834242700438695e-16, | |
| "sampling/sampling_logp_difference/max": 27.188508987426758, | |
| "sampling/sampling_logp_difference/mean": 0.039306361228227615, | |
| "step": 349, | |
| "step_time": 37.862728561997756 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05868378118611872, | |
| "clip_ratio/high_mean": 0.017275112157221884, | |
| "clip_ratio/low_mean": 0.013766340038273484, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03104145231191069, | |
| "entropy": 0.14939681394025683, | |
| "epoch": 0.007, | |
| "grad_norm": 0.3000262379646301, | |
| "kl": 0.5239376667886972, | |
| "learning_rate": 9.999817416785565e-05, | |
| "loss": 0.0173, | |
| "step": 350, | |
| "step_time": 9.859687822999149 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013706140452995896, | |
| "clip_ratio/high_mean": 0.003426535113248974, | |
| "clip_ratio/low_mean": 0.0026041666860692203, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006030701799318194, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1342.0, | |
| "completions/max_terminated_length": 1342.0, | |
| "completions/mean_length": 1236.90625, | |
| "completions/mean_terminated_length": 1236.90625, | |
| "completions/min_length": 193.0, | |
| "completions/min_terminated_length": 193.0, | |
| "entropy": 0.14154944382607937, | |
| "epoch": 0.00702, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.47570475935935974, | |
| "kl": 0.5030098669230938, | |
| "learning_rate": 9.999816251992836e-05, | |
| "loss": -0.0158, | |
| "num_tokens": 19682494.0, | |
| "reward": 4.190635681152344, | |
| "reward_std": 14.216930389404297, | |
| "rewards/rollout_reward_func/mean": 4.190635681152344, | |
| "rewards/rollout_reward_func/std": 14.30445671081543, | |
| "sampling/importance_sampling_ratio/max": 1.5223360061645508, | |
| "sampling/importance_sampling_ratio/mean": 1.0101966857910156, | |
| "sampling/importance_sampling_ratio/min": 0.7218723297119141, | |
| "sampling/sampling_logp_difference/max": 0.302712082862854, | |
| "sampling/sampling_logp_difference/mean": 0.007096399553120136, | |
| "step": 351, | |
| "step_time": 39.30754639200131 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03492647083476186, | |
| "clip_ratio/high_mean": 0.013026208442170173, | |
| "clip_ratio/low_mean": 0.018280228949151933, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.031306437274906784, | |
| "entropy": 0.1433765795081854, | |
| "epoch": 0.00704, | |
| "grad_norm": 0.2731061577796936, | |
| "kl": 0.5208645444363356, | |
| "learning_rate": 9.999815083496594e-05, | |
| "loss": -0.0214, | |
| "step": 352, | |
| "step_time": 9.398018900999887 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013888888992369175, | |
| "clip_ratio/high_mean": 0.0034722222480922937, | |
| "clip_ratio/low_mean": 0.0034722222480922937, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0069444444961845875, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1348.0, | |
| "completions/max_terminated_length": 1348.0, | |
| "completions/mean_length": 1237.3125, | |
| "completions/mean_terminated_length": 1237.3125, | |
| "completions/min_length": 181.0, | |
| "completions/min_terminated_length": 181.0, | |
| "entropy": 0.1480951178818941, | |
| "epoch": 0.00706, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5071465373039246, | |
| "kl": 0.5567373130470514, | |
| "learning_rate": 9.99981391129684e-05, | |
| "loss": -0.008, | |
| "num_tokens": 19812942.0, | |
| "reward": 4.355041980743408, | |
| "reward_std": 13.132366180419922, | |
| "rewards/rollout_reward_func/mean": 4.355041980743408, | |
| "rewards/rollout_reward_func/std": 13.851308822631836, | |
| "sampling/importance_sampling_ratio/max": 1.4629567861557007, | |
| "sampling/importance_sampling_ratio/mean": 1.0300343036651611, | |
| "sampling/importance_sampling_ratio/min": 0.6640676856040955, | |
| "sampling/sampling_logp_difference/max": 0.5005507469177246, | |
| "sampling/sampling_logp_difference/mean": 0.007855242118239403, | |
| "step": 353, | |
| "step_time": 38.34223270599978 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.024305555736646056, | |
| "clip_ratio/high_mean": 0.007812500116415322, | |
| "clip_ratio/low_mean": 0.024994894512929022, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.032807394745759666, | |
| "entropy": 0.13850665464997292, | |
| "epoch": 0.00708, | |
| "grad_norm": 0.27875232696533203, | |
| "kl": 0.5900795683264732, | |
| "learning_rate": 9.999812735393576e-05, | |
| "loss": -0.0167, | |
| "step": 354, | |
| "step_time": 9.843489302002126 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666744276881, | |
| "clip_ratio/high_mean": 0.0026041666860692203, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034722222480922937, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1358.0, | |
| "completions/max_terminated_length": 1358.0, | |
| "completions/mean_length": 1228.9375, | |
| "completions/mean_terminated_length": 1228.9375, | |
| "completions/min_length": 699.0, | |
| "completions/min_terminated_length": 699.0, | |
| "entropy": 0.13509350316599011, | |
| "epoch": 0.0071, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5068008899688721, | |
| "kl": 0.4818702656775713, | |
| "learning_rate": 9.999811555786804e-05, | |
| "loss": 0.0278, | |
| "num_tokens": 19942820.0, | |
| "reward": 5.804719924926758, | |
| "reward_std": 13.167655944824219, | |
| "rewards/rollout_reward_func/mean": 5.804719924926758, | |
| "rewards/rollout_reward_func/std": 13.18018913269043, | |
| "sampling/importance_sampling_ratio/max": 1.306014895439148, | |
| "sampling/importance_sampling_ratio/mean": 1.0091025829315186, | |
| "sampling/importance_sampling_ratio/min": 0.625792384147644, | |
| "sampling/sampling_logp_difference/max": 0.3519221544265747, | |
| "sampling/sampling_logp_difference/mean": 0.006864185445010662, | |
| "step": 355, | |
| "step_time": 39.10083819900228 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.031250000232830644, | |
| "clip_ratio/high_mean": 0.013888889225199819, | |
| "clip_ratio/low_mean": 0.026143791212234646, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.040032680495642126, | |
| "entropy": 0.1260006483644247, | |
| "epoch": 0.00712, | |
| "grad_norm": 0.2814493179321289, | |
| "kl": 0.5503856968134642, | |
| "learning_rate": 9.999810372476525e-05, | |
| "loss": 0.0244, | |
| "step": 356, | |
| "step_time": 9.433313735999036 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1365.0, | |
| "completions/max_terminated_length": 1365.0, | |
| "completions/mean_length": 1269.96875, | |
| "completions/mean_terminated_length": 1269.96875, | |
| "completions/min_length": 1127.0, | |
| "completions/min_terminated_length": 1127.0, | |
| "entropy": 0.1161547633819282, | |
| "epoch": 0.00714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4718828499317169, | |
| "kl": 0.9211016893386841, | |
| "learning_rate": 9.999809185462739e-05, | |
| "loss": 0.039, | |
| "num_tokens": 20075371.0, | |
| "reward": 3.961371898651123, | |
| "reward_std": 11.789936065673828, | |
| "rewards/rollout_reward_func/mean": 3.961371898651123, | |
| "rewards/rollout_reward_func/std": 12.59416675567627, | |
| "sampling/importance_sampling_ratio/max": 1.223926067352295, | |
| "sampling/importance_sampling_ratio/mean": 0.9972316026687622, | |
| "sampling/importance_sampling_ratio/min": 0.7068163156509399, | |
| "sampling/sampling_logp_difference/max": 0.21517443656921387, | |
| "sampling/sampling_logp_difference/mean": 0.0053621698170900345, | |
| "step": 357, | |
| "step_time": 38.67136614899937 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.049019608180969954, | |
| "clip_ratio/high_mean": 0.013991013227496296, | |
| "clip_ratio/low_mean": 0.01996527804294601, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03395629138685763, | |
| "entropy": 0.11733251390978694, | |
| "epoch": 0.00716, | |
| "grad_norm": 0.16502372920513153, | |
| "kl": 0.7801671754568815, | |
| "learning_rate": 9.999807994745449e-05, | |
| "loss": 0.0324, | |
| "step": 358, | |
| "step_time": 9.794801944999563 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1343.0, | |
| "completions/max_terminated_length": 1343.0, | |
| "completions/mean_length": 1256.96875, | |
| "completions/mean_terminated_length": 1256.96875, | |
| "completions/min_length": 1011.0, | |
| "completions/min_terminated_length": 1011.0, | |
| "entropy": 0.13261962542310357, | |
| "epoch": 0.00718, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4223484992980957, | |
| "kl": 0.6454224642366171, | |
| "learning_rate": 9.999806800324652e-05, | |
| "loss": -0.0021, | |
| "num_tokens": 20207093.0, | |
| "reward": 4.6267523765563965, | |
| "reward_std": 13.086963653564453, | |
| "rewards/rollout_reward_func/mean": 4.626751899719238, | |
| "rewards/rollout_reward_func/std": 14.676898956298828, | |
| "sampling/importance_sampling_ratio/max": 1.336045265197754, | |
| "sampling/importance_sampling_ratio/mean": 0.9978616237640381, | |
| "sampling/importance_sampling_ratio/min": 0.6580431461334229, | |
| "sampling/sampling_logp_difference/max": 0.287054181098938, | |
| "sampling/sampling_logp_difference/mean": 0.0057748714461922646, | |
| "step": 359, | |
| "step_time": 38.74908411499746 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.058662281604483724, | |
| "clip_ratio/high_mean": 0.018137792707420886, | |
| "clip_ratio/low_mean": 0.018183479725848883, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03632127266610041, | |
| "entropy": 0.1370791387744248, | |
| "epoch": 0.0072, | |
| "grad_norm": 0.26030489802360535, | |
| "kl": 0.6394520290195942, | |
| "learning_rate": 9.999805602200354e-05, | |
| "loss": -0.0085, | |
| "step": 360, | |
| "step_time": 9.32019592799952 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0008680555620230734, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1344.0, | |
| "completions/max_terminated_length": 1344.0, | |
| "completions/mean_length": 1241.734375, | |
| "completions/mean_terminated_length": 1241.734375, | |
| "completions/min_length": 1025.0, | |
| "completions/min_terminated_length": 1025.0, | |
| "entropy": 0.1345509896054864, | |
| "epoch": 0.00722, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5100898742675781, | |
| "kl": 0.8387140035629272, | |
| "learning_rate": 9.999804400372554e-05, | |
| "loss": 0.007, | |
| "num_tokens": 20337789.0, | |
| "reward": 9.449151039123535, | |
| "reward_std": 12.286431312561035, | |
| "rewards/rollout_reward_func/mean": 9.449151039123535, | |
| "rewards/rollout_reward_func/std": 13.57576847076416, | |
| "sampling/importance_sampling_ratio/max": 1.4115562438964844, | |
| "sampling/importance_sampling_ratio/mean": 0.990313708782196, | |
| "sampling/importance_sampling_ratio/min": 0.6950281858444214, | |
| "sampling/sampling_logp_difference/max": 0.3388124704360962, | |
| "sampling/sampling_logp_difference/mean": 0.00530852098017931, | |
| "step": 361, | |
| "step_time": 39.235169910002696 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0349264710675925, | |
| "clip_ratio/high_mean": 0.010467728832736611, | |
| "clip_ratio/low_mean": 0.01741217344533652, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.027879902394488454, | |
| "entropy": 0.15170079609379172, | |
| "epoch": 0.00724, | |
| "grad_norm": 0.33363601565361023, | |
| "kl": 0.6247174255549908, | |
| "learning_rate": 9.999803194841253e-05, | |
| "loss": 0.0003, | |
| "step": 362, | |
| "step_time": 9.35076707999906 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041666860692203, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1359.0, | |
| "completions/max_terminated_length": 1359.0, | |
| "completions/mean_length": 1251.59375, | |
| "completions/mean_terminated_length": 1251.59375, | |
| "completions/min_length": 700.0, | |
| "completions/min_terminated_length": 700.0, | |
| "entropy": 0.1958311009220779, | |
| "epoch": 0.00726, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5317199230194092, | |
| "kl": 0.5815525501966476, | |
| "learning_rate": 9.999801985606452e-05, | |
| "loss": 0.0042, | |
| "num_tokens": 20469218.0, | |
| "reward": 3.8903188705444336, | |
| "reward_std": 13.076482772827148, | |
| "rewards/rollout_reward_func/mean": 3.8903186321258545, | |
| "rewards/rollout_reward_func/std": 13.372103691101074, | |
| "sampling/importance_sampling_ratio/max": 1.3623380661010742, | |
| "sampling/importance_sampling_ratio/mean": 1.0135592222213745, | |
| "sampling/importance_sampling_ratio/min": 0.7123748064041138, | |
| "sampling/sampling_logp_difference/max": 0.29522740840911865, | |
| "sampling/sampling_logp_difference/mean": 0.006953438278287649, | |
| "step": 363, | |
| "step_time": 39.74288477299888 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06527777831070125, | |
| "clip_ratio/high_mean": 0.01979166700039059, | |
| "clip_ratio/low_mean": 0.021701389166992158, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04149305640021339, | |
| "entropy": 0.2018888248130679, | |
| "epoch": 0.00728, | |
| "grad_norm": 0.2644880712032318, | |
| "kl": 0.5717838387936354, | |
| "learning_rate": 9.999800772668153e-05, | |
| "loss": -0.0029, | |
| "step": 364, | |
| "step_time": 9.321073625997087 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041666860692203, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1354.0, | |
| "completions/max_terminated_length": 1354.0, | |
| "completions/mean_length": 1249.453125, | |
| "completions/mean_terminated_length": 1249.453125, | |
| "completions/min_length": 467.0, | |
| "completions/min_terminated_length": 467.0, | |
| "entropy": 0.18510928004980087, | |
| "epoch": 0.0073, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.442364364862442, | |
| "kl": 0.4795332048088312, | |
| "learning_rate": 9.999799556026358e-05, | |
| "loss": -0.0238, | |
| "num_tokens": 20600462.0, | |
| "reward": 6.273903846740723, | |
| "reward_std": 12.39173698425293, | |
| "rewards/rollout_reward_func/mean": 6.273903846740723, | |
| "rewards/rollout_reward_func/std": 13.681985855102539, | |
| "sampling/importance_sampling_ratio/max": 1.3438879251480103, | |
| "sampling/importance_sampling_ratio/mean": 0.9609812498092651, | |
| "sampling/importance_sampling_ratio/min": 0.6316797733306885, | |
| "sampling/sampling_logp_difference/max": 0.33423590660095215, | |
| "sampling/sampling_logp_difference/mean": 0.007498072925955057, | |
| "step": 365, | |
| "step_time": 38.43022035099784 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05171783687546849, | |
| "clip_ratio/high_mean": 0.013797514839097857, | |
| "clip_ratio/low_mean": 0.007766812981572002, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0215643277624622, | |
| "entropy": 0.18268039543181658, | |
| "epoch": 0.00732, | |
| "grad_norm": 0.2666545808315277, | |
| "kl": 0.47542588133364916, | |
| "learning_rate": 9.999798335681066e-05, | |
| "loss": -0.0309, | |
| "step": 366, | |
| "step_time": 9.454387761999897 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0016904239892028272, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016904239892028272, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1344.0, | |
| "completions/max_terminated_length": 1344.0, | |
| "completions/mean_length": 1222.125, | |
| "completions/mean_terminated_length": 1222.125, | |
| "completions/min_length": 999.0, | |
| "completions/min_terminated_length": 999.0, | |
| "entropy": 0.21282331459224224, | |
| "epoch": 0.00734, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8037834763526917, | |
| "kl": 0.6722489278763533, | |
| "learning_rate": 9.99979711163228e-05, | |
| "loss": 0.0148, | |
| "num_tokens": 20729886.0, | |
| "reward": 5.174856662750244, | |
| "reward_std": 11.355770111083984, | |
| "rewards/rollout_reward_func/mean": 5.174857139587402, | |
| "rewards/rollout_reward_func/std": 11.9678955078125, | |
| "sampling/importance_sampling_ratio/max": 1.8758124113082886, | |
| "sampling/importance_sampling_ratio/mean": 1.0103557109832764, | |
| "sampling/importance_sampling_ratio/min": 0.7285647392272949, | |
| "sampling/sampling_logp_difference/max": 0.3263084888458252, | |
| "sampling/sampling_logp_difference/mean": 0.008695240132510662, | |
| "step": 367, | |
| "step_time": 37.67410640100388 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.036011905409395695, | |
| "clip_ratio/high_mean": 0.01073908741818741, | |
| "clip_ratio/low_mean": 0.02561856439569965, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0363576520467177, | |
| "entropy": 0.21391737554222345, | |
| "epoch": 0.00736, | |
| "grad_norm": 0.4400097727775574, | |
| "kl": 0.749302851036191, | |
| "learning_rate": 9.999795883880001e-05, | |
| "loss": 0.005, | |
| "step": 368, | |
| "step_time": 9.906740201999128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.003426535113248974, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003426535113248974, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1348.0, | |
| "completions/max_terminated_length": 1348.0, | |
| "completions/mean_length": 1231.171875, | |
| "completions/mean_terminated_length": 1231.171875, | |
| "completions/min_length": 994.0, | |
| "completions/min_terminated_length": 994.0, | |
| "entropy": 0.2234082594513893, | |
| "epoch": 0.00738, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.0522061586380005, | |
| "kl": 0.9143509455025196, | |
| "learning_rate": 9.999794652424228e-05, | |
| "loss": 0.0039, | |
| "num_tokens": 20859908.0, | |
| "reward": 8.738959312438965, | |
| "reward_std": 11.845466613769531, | |
| "rewards/rollout_reward_func/mean": 8.738959312438965, | |
| "rewards/rollout_reward_func/std": 12.123114585876465, | |
| "sampling/importance_sampling_ratio/max": 1.3250545263290405, | |
| "sampling/importance_sampling_ratio/mean": 1.020609974861145, | |
| "sampling/importance_sampling_ratio/min": 0.5885343551635742, | |
| "sampling/sampling_logp_difference/max": 0.4251088500022888, | |
| "sampling/sampling_logp_difference/mean": 0.008847212418913841, | |
| "step": 369, | |
| "step_time": 38.576368669004296 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05455874605104327, | |
| "clip_ratio/high_mean": 0.016198166005779058, | |
| "clip_ratio/low_mean": 0.03593064745655283, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05212881352053955, | |
| "entropy": 0.22893889155238867, | |
| "epoch": 0.0074, | |
| "grad_norm": 0.39881831407546997, | |
| "kl": 1.0176227018237114, | |
| "learning_rate": 9.999793417264966e-05, | |
| "loss": -0.0017, | |
| "step": 370, | |
| "step_time": 8.888960126005259 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1348.0, | |
| "completions/max_terminated_length": 1348.0, | |
| "completions/mean_length": 1242.28125, | |
| "completions/mean_terminated_length": 1242.28125, | |
| "completions/min_length": 913.0, | |
| "completions/min_terminated_length": 913.0, | |
| "entropy": 0.2308051260188222, | |
| "epoch": 0.00742, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5715855956077576, | |
| "kl": 0.87254199385643, | |
| "learning_rate": 9.999792178402214e-05, | |
| "loss": -0.0234, | |
| "num_tokens": 20990697.0, | |
| "reward": 6.123772621154785, | |
| "reward_std": 10.485084533691406, | |
| "rewards/rollout_reward_func/mean": 6.123772144317627, | |
| "rewards/rollout_reward_func/std": 11.30632209777832, | |
| "sampling/importance_sampling_ratio/max": 1.4539350271224976, | |
| "sampling/importance_sampling_ratio/mean": 1.0005998611450195, | |
| "sampling/importance_sampling_ratio/min": 0.5505736470222473, | |
| "sampling/sampling_logp_difference/max": 0.3510777950286865, | |
| "sampling/sampling_logp_difference/mean": 0.009203520603477955, | |
| "step": 371, | |
| "step_time": 38.65458783400027 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.045200163731351495, | |
| "clip_ratio/high_mean": 0.013036152173299342, | |
| "clip_ratio/low_mean": 0.023381332110147923, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03641748463269323, | |
| "entropy": 0.2447242382913828, | |
| "epoch": 0.00744, | |
| "grad_norm": 0.298229455947876, | |
| "kl": 0.8313354179263115, | |
| "learning_rate": 9.999790935835973e-05, | |
| "loss": -0.0303, | |
| "step": 372, | |
| "step_time": 9.79756171700501 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010051169665530324, | |
| "clip_ratio/high_mean": 0.002512792416382581, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0033808479784056544, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1358.0, | |
| "completions/max_terminated_length": 1358.0, | |
| "completions/mean_length": 1223.75, | |
| "completions/mean_terminated_length": 1223.75, | |
| "completions/min_length": 908.0, | |
| "completions/min_terminated_length": 908.0, | |
| "entropy": 0.25220474135130644, | |
| "epoch": 0.00746, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5470872521400452, | |
| "kl": 0.7501334678381681, | |
| "learning_rate": 9.999789689566245e-05, | |
| "loss": 0.0016, | |
| "num_tokens": 21120250.0, | |
| "reward": 4.980414867401123, | |
| "reward_std": 13.811859130859375, | |
| "rewards/rollout_reward_func/mean": 4.980414867401123, | |
| "rewards/rollout_reward_func/std": 15.705443382263184, | |
| "sampling/importance_sampling_ratio/max": 1.4358227252960205, | |
| "sampling/importance_sampling_ratio/mean": 0.9660643339157104, | |
| "sampling/importance_sampling_ratio/min": 0.4660157859325409, | |
| "sampling/sampling_logp_difference/max": 0.5715584754943848, | |
| "sampling/sampling_logp_difference/mean": 0.011051887646317482, | |
| "step": 373, | |
| "step_time": 37.840678287995615 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07236842135898769, | |
| "clip_ratio/high_mean": 0.021564327646046877, | |
| "clip_ratio/low_mean": 0.025087612215429544, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04665194044355303, | |
| "entropy": 0.2590667102485895, | |
| "epoch": 0.00748, | |
| "grad_norm": 0.33827197551727295, | |
| "kl": 0.7311984747648239, | |
| "learning_rate": 9.999788439593031e-05, | |
| "loss": -0.0111, | |
| "step": 374, | |
| "step_time": 8.941922394003996 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0008680555620230734, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1351.0, | |
| "completions/max_terminated_length": 1351.0, | |
| "completions/mean_length": 1216.34375, | |
| "completions/mean_terminated_length": 1216.34375, | |
| "completions/min_length": 993.0, | |
| "completions/min_terminated_length": 993.0, | |
| "entropy": 0.2550716269761324, | |
| "epoch": 0.0075, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8335476517677307, | |
| "kl": 0.9307033438235521, | |
| "learning_rate": 9.999787185916331e-05, | |
| "loss": 0.0294, | |
| "num_tokens": 21249311.0, | |
| "reward": 5.41689920425415, | |
| "reward_std": 12.388166427612305, | |
| "rewards/rollout_reward_func/mean": 5.416898727416992, | |
| "rewards/rollout_reward_func/std": 13.267603874206543, | |
| "sampling/importance_sampling_ratio/max": 1.4684741497039795, | |
| "sampling/importance_sampling_ratio/mean": 1.0054916143417358, | |
| "sampling/importance_sampling_ratio/min": 0.6331810355186462, | |
| "sampling/sampling_logp_difference/max": 0.2799299955368042, | |
| "sampling/sampling_logp_difference/mean": 0.01017037034034729, | |
| "step": 375, | |
| "step_time": 38.13824768100312 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06950894417241216, | |
| "clip_ratio/high_mean": 0.021580452797934413, | |
| "clip_ratio/low_mean": 0.02794391370844096, | |
| "clip_ratio/low_min": 0.0034722222480922937, | |
| "clip_ratio/region_mean": 0.049524366680998355, | |
| "entropy": 0.2559032328426838, | |
| "epoch": 0.00752, | |
| "grad_norm": 0.2704547047615051, | |
| "kl": 0.9575543515384197, | |
| "learning_rate": 9.999785928536148e-05, | |
| "loss": 0.0164, | |
| "step": 376, | |
| "step_time": 9.173922988000413 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00657894741743803, | |
| "clip_ratio/high_mean": 0.0016447368543595076, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016447368543595076, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1330.0, | |
| "completions/max_terminated_length": 1330.0, | |
| "completions/mean_length": 1201.6875, | |
| "completions/mean_terminated_length": 1201.6875, | |
| "completions/min_length": 1002.0, | |
| "completions/min_terminated_length": 1002.0, | |
| "entropy": 0.2529036393389106, | |
| "epoch": 0.00754, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5121884346008301, | |
| "kl": 0.7823121659457684, | |
| "learning_rate": 9.999784667452484e-05, | |
| "loss": -0.0058, | |
| "num_tokens": 21377388.0, | |
| "reward": 5.937844753265381, | |
| "reward_std": 10.75206184387207, | |
| "rewards/rollout_reward_func/mean": 5.937845230102539, | |
| "rewards/rollout_reward_func/std": 10.630824089050293, | |
| "sampling/importance_sampling_ratio/max": 1.2855048179626465, | |
| "sampling/importance_sampling_ratio/mean": 0.9824115037918091, | |
| "sampling/importance_sampling_ratio/min": 0.7048435807228088, | |
| "sampling/sampling_logp_difference/max": 0.3823585510253906, | |
| "sampling/sampling_logp_difference/mean": 0.010149901732802391, | |
| "step": 377, | |
| "step_time": 37.091166489997704 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.030701754614710808, | |
| "clip_ratio/high_mean": 0.011025112122297287, | |
| "clip_ratio/low_mean": 0.024379125621635467, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03540423803497106, | |
| "entropy": 0.24686269089579582, | |
| "epoch": 0.00756, | |
| "grad_norm": 0.308444082736969, | |
| "kl": 0.7720872350037098, | |
| "learning_rate": 9.999783402665338e-05, | |
| "loss": -0.0141, | |
| "step": 378, | |
| "step_time": 8.784612373994605 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01686507952399552, | |
| "clip_ratio/high_mean": 0.00421626988099888, | |
| "clip_ratio/low_mean": 0.0035807291860692203, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0077969990670681, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1343.0, | |
| "completions/max_terminated_length": 1343.0, | |
| "completions/mean_length": 1210.0625, | |
| "completions/mean_terminated_length": 1210.0625, | |
| "completions/min_length": 720.0, | |
| "completions/min_terminated_length": 720.0, | |
| "entropy": 0.2716317633166909, | |
| "epoch": 0.00758, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6121698021888733, | |
| "kl": 0.925221860408783, | |
| "learning_rate": 9.999782134174711e-05, | |
| "loss": -0.0013, | |
| "num_tokens": 21506045.0, | |
| "reward": 2.2015371322631836, | |
| "reward_std": 15.299311637878418, | |
| "rewards/rollout_reward_func/mean": 2.2015371322631836, | |
| "rewards/rollout_reward_func/std": 15.50017261505127, | |
| "sampling/importance_sampling_ratio/max": 1.400822639465332, | |
| "sampling/importance_sampling_ratio/mean": 0.9883875846862793, | |
| "sampling/importance_sampling_ratio/min": 0.625456690788269, | |
| "sampling/sampling_logp_difference/max": 0.3319031000137329, | |
| "sampling/sampling_logp_difference/mean": 0.011153844185173512, | |
| "step": 379, | |
| "step_time": 36.844649089001905 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05834899842739105, | |
| "clip_ratio/high_mean": 0.017242478381376714, | |
| "clip_ratio/low_mean": 0.03576550219440833, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05300798005191609, | |
| "entropy": 0.2637836243957281, | |
| "epoch": 0.0076, | |
| "grad_norm": 0.4454600512981415, | |
| "kl": 0.9274842478334904, | |
| "learning_rate": 9.999780861980607e-05, | |
| "loss": -0.0126, | |
| "step": 380, | |
| "step_time": 9.811575109000842 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010620915098115802, | |
| "clip_ratio/high_mean": 0.0026552287745289505, | |
| "clip_ratio/low_mean": 0.0009191176504828036, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003574346425011754, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1341.0, | |
| "completions/max_terminated_length": 1341.0, | |
| "completions/mean_length": 1219.8125, | |
| "completions/mean_terminated_length": 1219.8125, | |
| "completions/min_length": 991.0, | |
| "completions/min_terminated_length": 991.0, | |
| "entropy": 0.2292822152376175, | |
| "epoch": 0.00762, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6078642010688782, | |
| "kl": 0.803357319906354, | |
| "learning_rate": 9.999779586083025e-05, | |
| "loss": 0.004, | |
| "num_tokens": 21635298.0, | |
| "reward": 5.408005714416504, | |
| "reward_std": 9.926593780517578, | |
| "rewards/rollout_reward_func/mean": 5.4080047607421875, | |
| "rewards/rollout_reward_func/std": 11.208430290222168, | |
| "sampling/importance_sampling_ratio/max": 1.2364863157272339, | |
| "sampling/importance_sampling_ratio/mean": 0.997908890247345, | |
| "sampling/importance_sampling_ratio/min": 0.6723216772079468, | |
| "sampling/sampling_logp_difference/max": 0.38260674476623535, | |
| "sampling/sampling_logp_difference/mean": 0.007029087748378515, | |
| "step": 381, | |
| "step_time": 37.91188854600114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05433114105835557, | |
| "clip_ratio/high_mean": 0.014450840826611966, | |
| "clip_ratio/low_mean": 0.02246758935507387, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03691843029810116, | |
| "entropy": 0.21105156652629375, | |
| "epoch": 0.00764, | |
| "grad_norm": 0.3075491189956665, | |
| "kl": 0.9027222413569689, | |
| "learning_rate": 9.999778306481968e-05, | |
| "loss": -0.0043, | |
| "step": 382, | |
| "step_time": 9.498284126000726 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0036764706019312143, | |
| "clip_ratio/high_mean": 0.0009191176504828036, | |
| "clip_ratio/low_mean": 0.0008680555620230734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.001787173212505877, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1357.0, | |
| "completions/max_terminated_length": 1357.0, | |
| "completions/mean_length": 1213.765625, | |
| "completions/mean_terminated_length": 1213.765625, | |
| "completions/min_length": 957.0, | |
| "completions/min_terminated_length": 957.0, | |
| "entropy": 0.20513668935745955, | |
| "epoch": 0.00766, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6294713020324707, | |
| "kl": 0.7374063562601805, | |
| "learning_rate": 9.999777023177434e-05, | |
| "loss": 0.0252, | |
| "num_tokens": 21764144.0, | |
| "reward": 8.72990894317627, | |
| "reward_std": 11.312125205993652, | |
| "rewards/rollout_reward_func/mean": 8.729909896850586, | |
| "rewards/rollout_reward_func/std": 11.270212173461914, | |
| "sampling/importance_sampling_ratio/max": 1.667926549911499, | |
| "sampling/importance_sampling_ratio/mean": 1.0118814706802368, | |
| "sampling/importance_sampling_ratio/min": 0.7219305038452148, | |
| "sampling/sampling_logp_difference/max": 0.31063222885131836, | |
| "sampling/sampling_logp_difference/mean": 0.007431398145854473, | |
| "step": 383, | |
| "step_time": 37.184195774003456 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.024509804090484977, | |
| "clip_ratio/high_mean": 0.006995506584644318, | |
| "clip_ratio/low_mean": 0.034743722644634545, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.041739229462109506, | |
| "entropy": 0.19153737649321556, | |
| "epoch": 0.00768, | |
| "grad_norm": 0.37729939818382263, | |
| "kl": 1.0056524686515331, | |
| "learning_rate": 9.999775736169427e-05, | |
| "loss": 0.0245, | |
| "step": 384, | |
| "step_time": 8.749921898001048 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0059523810632526875, | |
| "clip_ratio/high_mean": 0.0014880952658131719, | |
| "clip_ratio/low_mean": 0.004579809028655291, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0060679042944684625, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1551.0, | |
| "completions/max_terminated_length": 1551.0, | |
| "completions/mean_length": 1410.3125, | |
| "completions/mean_terminated_length": 1410.3125, | |
| "completions/min_length": 765.0, | |
| "completions/min_terminated_length": 765.0, | |
| "entropy": 0.20708153676241636, | |
| "epoch": 0.0077, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.738293468952179, | |
| "kl": 0.887890812009573, | |
| "learning_rate": 9.99977444545795e-05, | |
| "loss": -0.0628, | |
| "num_tokens": 21905662.0, | |
| "reward": 9.202791213989258, | |
| "reward_std": 15.181166648864746, | |
| "rewards/rollout_reward_func/mean": 9.202792167663574, | |
| "rewards/rollout_reward_func/std": 15.67770767211914, | |
| "sampling/importance_sampling_ratio/max": 1.5441806316375732, | |
| "sampling/importance_sampling_ratio/mean": 0.9917982816696167, | |
| "sampling/importance_sampling_ratio/min": 5.679499839178481e-13, | |
| "sampling/sampling_logp_difference/max": 22.66815948486328, | |
| "sampling/sampling_logp_difference/mean": 0.030492324382066727, | |
| "step": 385, | |
| "step_time": 39.20689014000345 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023971861926838756, | |
| "clip_ratio/high_mean": 0.007481060747522861, | |
| "clip_ratio/low_mean": 0.03716492815874517, | |
| "clip_ratio/low_min": 0.0029761905316263437, | |
| "clip_ratio/region_mean": 0.04464598890626803, | |
| "entropy": 0.1879758802242577, | |
| "epoch": 0.00772, | |
| "grad_norm": 0.41567400097846985, | |
| "kl": 0.8819043859839439, | |
| "learning_rate": 9.999773151042999e-05, | |
| "loss": -0.0737, | |
| "step": 386, | |
| "step_time": 10.475744582005063 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.009424603311344981, | |
| "clip_ratio/high_mean": 0.0023561508278362453, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031001984607428312, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1556.0, | |
| "completions/max_terminated_length": 1556.0, | |
| "completions/mean_length": 1433.34375, | |
| "completions/mean_terminated_length": 1433.34375, | |
| "completions/min_length": 1211.0, | |
| "completions/min_terminated_length": 1211.0, | |
| "entropy": 0.15365674067288637, | |
| "epoch": 0.00774, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5962560176849365, | |
| "kl": 0.6632435545325279, | |
| "learning_rate": 9.99977185292458e-05, | |
| "loss": 0.0216, | |
| "num_tokens": 22048591.0, | |
| "reward": 13.268250465393066, | |
| "reward_std": 13.775822639465332, | |
| "rewards/rollout_reward_func/mean": 13.268250465393066, | |
| "rewards/rollout_reward_func/std": 14.63206958770752, | |
| "sampling/importance_sampling_ratio/max": 1.2218079566955566, | |
| "sampling/importance_sampling_ratio/mean": 0.9793609380722046, | |
| "sampling/importance_sampling_ratio/min": 0.6325286626815796, | |
| "sampling/sampling_logp_difference/max": 0.38329482078552246, | |
| "sampling/sampling_logp_difference/mean": 0.0064071910455822945, | |
| "step": 387, | |
| "step_time": 41.232673029999205 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05530754057690501, | |
| "clip_ratio/high_mean": 0.01680307579226792, | |
| "clip_ratio/low_mean": 0.014248512219637632, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.031051588244736195, | |
| "entropy": 0.14256418915465474, | |
| "epoch": 0.00776, | |
| "grad_norm": 0.527172863483429, | |
| "kl": 0.646535612642765, | |
| "learning_rate": 9.999770551102692e-05, | |
| "loss": 0.0167, | |
| "step": 388, | |
| "step_time": 10.636822301992652 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0031250000465661287, | |
| "clip_ratio/high_mean": 0.0007812500116415322, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0015252976445481181, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1545.0, | |
| "completions/max_terminated_length": 1545.0, | |
| "completions/mean_length": 1429.21875, | |
| "completions/mean_terminated_length": 1429.21875, | |
| "completions/min_length": 1226.0, | |
| "completions/min_terminated_length": 1226.0, | |
| "entropy": 0.14011064730584621, | |
| "epoch": 0.00778, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5465406775474548, | |
| "kl": 0.6449617743492126, | |
| "learning_rate": 9.999769245577337e-05, | |
| "loss": -0.0416, | |
| "num_tokens": 22191273.0, | |
| "reward": 10.615909576416016, | |
| "reward_std": 10.947202682495117, | |
| "rewards/rollout_reward_func/mean": 10.615909576416016, | |
| "rewards/rollout_reward_func/std": 12.735282897949219, | |
| "sampling/importance_sampling_ratio/max": 2.317744016647339, | |
| "sampling/importance_sampling_ratio/mean": 1.0246381759643555, | |
| "sampling/importance_sampling_ratio/min": 0.2836526930332184, | |
| "sampling/sampling_logp_difference/max": 1.3213729858398438, | |
| "sampling/sampling_logp_difference/mean": 0.008526146411895752, | |
| "step": 389, | |
| "step_time": 41.60411787599878 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03645833395421505, | |
| "clip_ratio/high_mean": 0.01361607201397419, | |
| "clip_ratio/low_mean": 0.013582785322796553, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02719885722035542, | |
| "entropy": 0.15200490225106478, | |
| "epoch": 0.0078, | |
| "grad_norm": 0.4452749192714691, | |
| "kl": 0.5898908544331789, | |
| "learning_rate": 9.999767936348516e-05, | |
| "loss": -0.05, | |
| "step": 390, | |
| "step_time": 10.0632308130007 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.0014880952658131719, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002232142898719758, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1550.0, | |
| "completions/max_terminated_length": 1550.0, | |
| "completions/mean_length": 1416.0625, | |
| "completions/mean_terminated_length": 1416.0625, | |
| "completions/min_length": 193.0, | |
| "completions/min_terminated_length": 193.0, | |
| "entropy": 0.16107679810374975, | |
| "epoch": 0.00782, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5122284889221191, | |
| "kl": 0.5392248686403036, | |
| "learning_rate": 9.999766623416232e-05, | |
| "loss": -0.0577, | |
| "num_tokens": 22333164.0, | |
| "reward": 14.949935913085938, | |
| "reward_std": 16.67510414123535, | |
| "rewards/rollout_reward_func/mean": 14.949935913085938, | |
| "rewards/rollout_reward_func/std": 18.703474044799805, | |
| "sampling/importance_sampling_ratio/max": 1.4272053241729736, | |
| "sampling/importance_sampling_ratio/mean": 0.9347177743911743, | |
| "sampling/importance_sampling_ratio/min": 0.16998553276062012, | |
| "sampling/sampling_logp_difference/max": 1.3626210689544678, | |
| "sampling/sampling_logp_difference/mean": 0.009718427434563637, | |
| "step": 391, | |
| "step_time": 39.99297312899398 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02976190554909408, | |
| "clip_ratio/high_mean": 0.008928571594879031, | |
| "clip_ratio/low_mean": 0.015298011188860983, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.024226582725532353, | |
| "entropy": 0.14390948927029967, | |
| "epoch": 0.00784, | |
| "grad_norm": 0.4105764627456665, | |
| "kl": 0.5607901010662317, | |
| "learning_rate": 9.999765306780482e-05, | |
| "loss": -0.0626, | |
| "step": 392, | |
| "step_time": 10.043341166003302 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.009077381109818816, | |
| "clip_ratio/high_mean": 0.002269345277454704, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00301339291036129, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1554.0, | |
| "completions/max_terminated_length": 1554.0, | |
| "completions/mean_length": 1443.640625, | |
| "completions/mean_terminated_length": 1443.640625, | |
| "completions/min_length": 1069.0, | |
| "completions/min_terminated_length": 1069.0, | |
| "entropy": 0.12537508364766836, | |
| "epoch": 0.00786, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7089640498161316, | |
| "kl": 0.9093821812421083, | |
| "learning_rate": 9.99976398644127e-05, | |
| "loss": 0.0186, | |
| "num_tokens": 22476782.0, | |
| "reward": 11.492524147033691, | |
| "reward_std": 15.943157196044922, | |
| "rewards/rollout_reward_func/mean": 11.492524147033691, | |
| "rewards/rollout_reward_func/std": 16.71925163269043, | |
| "sampling/importance_sampling_ratio/max": 1.7644160985946655, | |
| "sampling/importance_sampling_ratio/mean": 0.9914994239807129, | |
| "sampling/importance_sampling_ratio/min": 0.7484045028686523, | |
| "sampling/sampling_logp_difference/max": 0.4207209348678589, | |
| "sampling/sampling_logp_difference/mean": 0.006036281120032072, | |
| "step": 393, | |
| "step_time": 40.15463091899983 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01800595293752849, | |
| "clip_ratio/high_mean": 0.005245535809081048, | |
| "clip_ratio/low_mean": 0.018960813991725445, | |
| "clip_ratio/low_min": 0.0029761905316263437, | |
| "clip_ratio/region_mean": 0.024206349917221814, | |
| "entropy": 0.11524984752759337, | |
| "epoch": 0.00788, | |
| "grad_norm": 0.7076042890548706, | |
| "kl": 0.7414491530507803, | |
| "learning_rate": 9.9997626623986e-05, | |
| "loss": 0.0115, | |
| "step": 394, | |
| "step_time": 10.618017185999634 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014880952658131719, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1563.0, | |
| "completions/max_terminated_length": 1563.0, | |
| "completions/mean_length": 1443.5, | |
| "completions/mean_terminated_length": 1443.5, | |
| "completions/min_length": 427.0, | |
| "completions/min_terminated_length": 427.0, | |
| "entropy": 0.12252500653266907, | |
| "epoch": 0.0079, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7846469283103943, | |
| "kl": 0.7317866403609514, | |
| "learning_rate": 9.999761334652469e-05, | |
| "loss": 0.0075, | |
| "num_tokens": 22620477.0, | |
| "reward": 11.849661827087402, | |
| "reward_std": 16.187042236328125, | |
| "rewards/rollout_reward_func/mean": 11.849662780761719, | |
| "rewards/rollout_reward_func/std": 17.399803161621094, | |
| "sampling/importance_sampling_ratio/max": 1.4447773694992065, | |
| "sampling/importance_sampling_ratio/mean": 1.0075819492340088, | |
| "sampling/importance_sampling_ratio/min": 0.663360595703125, | |
| "sampling/sampling_logp_difference/max": 0.43144845962524414, | |
| "sampling/sampling_logp_difference/mean": 0.007304108701646328, | |
| "step": 395, | |
| "step_time": 40.574595107005734 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.033482143422588706, | |
| "clip_ratio/high_mean": 0.011425047181546688, | |
| "clip_ratio/low_mean": 0.01829117111628875, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02971621841425076, | |
| "entropy": 0.12393791414797306, | |
| "epoch": 0.00792, | |
| "grad_norm": 0.38290056586265564, | |
| "kl": 0.7317893952131271, | |
| "learning_rate": 9.999760003202881e-05, | |
| "loss": 0.0033, | |
| "step": 396, | |
| "step_time": 10.742739806995814 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01205357164144516, | |
| "clip_ratio/high_mean": 0.00301339291036129, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003757440543267876, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1543.0, | |
| "completions/max_terminated_length": 1543.0, | |
| "completions/mean_length": 1444.578125, | |
| "completions/mean_terminated_length": 1444.578125, | |
| "completions/min_length": 1287.0, | |
| "completions/min_terminated_length": 1287.0, | |
| "entropy": 0.13034009747207165, | |
| "epoch": 0.00794, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7314363121986389, | |
| "kl": 0.6178351659327745, | |
| "learning_rate": 9.999758668049833e-05, | |
| "loss": -0.0157, | |
| "num_tokens": 22764146.0, | |
| "reward": 11.904011726379395, | |
| "reward_std": 15.453010559082031, | |
| "rewards/rollout_reward_func/mean": 11.904010772705078, | |
| "rewards/rollout_reward_func/std": 16.291580200195312, | |
| "sampling/importance_sampling_ratio/max": 1.2977502346038818, | |
| "sampling/importance_sampling_ratio/mean": 0.9704984426498413, | |
| "sampling/importance_sampling_ratio/min": 0.6586284637451172, | |
| "sampling/sampling_logp_difference/max": 0.34184467792510986, | |
| "sampling/sampling_logp_difference/mean": 0.006397986318916082, | |
| "step": 397, | |
| "step_time": 40.79752054799974 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.043154762824997306, | |
| "clip_ratio/high_mean": 0.014508928987197578, | |
| "clip_ratio/low_mean": 0.025279997498728335, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.039788926660548896, | |
| "entropy": 0.11147738387808204, | |
| "epoch": 0.00796, | |
| "grad_norm": 0.28320473432540894, | |
| "kl": 0.7334012817591429, | |
| "learning_rate": 9.999757329193333e-05, | |
| "loss": -0.021, | |
| "step": 398, | |
| "step_time": 9.331709539997973 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0028409091755747795, | |
| "clip_ratio/high_mean": 0.0007102272938936949, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014542749268002808, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1546.0, | |
| "completions/max_terminated_length": 1546.0, | |
| "completions/mean_length": 1455.65625, | |
| "completions/mean_terminated_length": 1455.65625, | |
| "completions/min_length": 1290.0, | |
| "completions/min_terminated_length": 1290.0, | |
| "entropy": 0.12526550004258752, | |
| "epoch": 0.00798, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7262594103813171, | |
| "kl": 0.6103415302932262, | |
| "learning_rate": 9.999755986633378e-05, | |
| "loss": -0.0318, | |
| "num_tokens": 22908577.0, | |
| "reward": 9.555159568786621, | |
| "reward_std": 12.746781349182129, | |
| "rewards/rollout_reward_func/mean": 9.555160522460938, | |
| "rewards/rollout_reward_func/std": 14.475045204162598, | |
| "sampling/importance_sampling_ratio/max": 1.3095201253890991, | |
| "sampling/importance_sampling_ratio/mean": 0.9779645204544067, | |
| "sampling/importance_sampling_ratio/min": 7.978658610397404e-16, | |
| "sampling/sampling_logp_difference/max": 27.056884765625, | |
| "sampling/sampling_logp_difference/mean": 0.034079719334840775, | |
| "step": 399, | |
| "step_time": 41.13124246700136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.033107627648860216, | |
| "clip_ratio/high_mean": 0.010509049869142473, | |
| "clip_ratio/low_mean": 0.02362351247575134, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03413256263593212, | |
| "entropy": 0.1209962465800345, | |
| "epoch": 0.008, | |
| "grad_norm": 0.35216766595840454, | |
| "kl": 0.6524146590381861, | |
| "learning_rate": 9.99975464036997e-05, | |
| "loss": -0.044, | |
| "step": 400, | |
| "step_time": 10.55650918399806 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0059523810632526875, | |
| "clip_ratio/high_mean": 0.0014880952658131719, | |
| "clip_ratio/low_mean": 0.0015252976445481181, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00301339291036129, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1555.0, | |
| "completions/max_terminated_length": 1555.0, | |
| "completions/mean_length": 1432.03125, | |
| "completions/mean_terminated_length": 1432.03125, | |
| "completions/min_length": 1248.0, | |
| "completions/min_terminated_length": 1248.0, | |
| "entropy": 0.11135548166930676, | |
| "epoch": 0.00802, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.133269190788269, | |
| "kl": 0.7685734387487173, | |
| "learning_rate": 9.99975329040311e-05, | |
| "loss": 0.0237, | |
| "num_tokens": 23051446.0, | |
| "reward": 10.270038604736328, | |
| "reward_std": 15.682093620300293, | |
| "rewards/rollout_reward_func/mean": 10.270038604736328, | |
| "rewards/rollout_reward_func/std": 16.255008697509766, | |
| "sampling/importance_sampling_ratio/max": 1.4514762163162231, | |
| "sampling/importance_sampling_ratio/mean": 1.0226449966430664, | |
| "sampling/importance_sampling_ratio/min": 0.7271938920021057, | |
| "sampling/sampling_logp_difference/max": 0.45901012420654297, | |
| "sampling/sampling_logp_difference/mean": 0.005241828970611095, | |
| "step": 401, | |
| "step_time": 41.26497857599861 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03290043352171779, | |
| "clip_ratio/high_mean": 0.008225108380429447, | |
| "clip_ratio/low_mean": 0.015327381319366395, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023552489699795842, | |
| "entropy": 0.11777450842782855, | |
| "epoch": 0.00804, | |
| "grad_norm": 0.928424060344696, | |
| "kl": 1.0143736563622952, | |
| "learning_rate": 9.999751936732799e-05, | |
| "loss": 0.0269, | |
| "step": 402, | |
| "step_time": 10.720703897995918 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008928571594879031, | |
| "clip_ratio/high_mean": 0.0029761905316263437, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0029761905316263437, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1558.0, | |
| "completions/max_terminated_length": 1558.0, | |
| "completions/mean_length": 1402.484375, | |
| "completions/mean_terminated_length": 1402.484375, | |
| "completions/min_length": 274.0, | |
| "completions/min_terminated_length": 274.0, | |
| "entropy": 0.12977053970098495, | |
| "epoch": 0.00806, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5543701648712158, | |
| "kl": 0.6270178612321615, | |
| "learning_rate": 9.999750579359041e-05, | |
| "loss": 0.0183, | |
| "num_tokens": 23192365.0, | |
| "reward": 10.964115142822266, | |
| "reward_std": 14.9024658203125, | |
| "rewards/rollout_reward_func/mean": 10.964115142822266, | |
| "rewards/rollout_reward_func/std": 15.60954475402832, | |
| "sampling/importance_sampling_ratio/max": 1.254056453704834, | |
| "sampling/importance_sampling_ratio/mean": 0.9866700768470764, | |
| "sampling/importance_sampling_ratio/min": 0.661080539226532, | |
| "sampling/sampling_logp_difference/max": 0.2775760889053345, | |
| "sampling/sampling_logp_difference/mean": 0.005836261436343193, | |
| "step": 403, | |
| "step_time": 40.087825246997454 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029910714831203222, | |
| "clip_ratio/high_mean": 0.009676001209300011, | |
| "clip_ratio/low_mean": 0.016021825780626386, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025697826989926398, | |
| "entropy": 0.13238740153610706, | |
| "epoch": 0.00808, | |
| "grad_norm": 0.44548580050468445, | |
| "kl": 0.718162702396512, | |
| "learning_rate": 9.999749218281836e-05, | |
| "loss": 0.0147, | |
| "step": 404, | |
| "step_time": 9.659750243004964 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014880952658131719, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1552.0, | |
| "completions/max_terminated_length": 1552.0, | |
| "completions/mean_length": 1440.421875, | |
| "completions/mean_terminated_length": 1440.421875, | |
| "completions/min_length": 1154.0, | |
| "completions/min_terminated_length": 1154.0, | |
| "entropy": 0.12935744831338525, | |
| "epoch": 0.0081, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5480543375015259, | |
| "kl": 0.6046669036149979, | |
| "learning_rate": 9.999747853501184e-05, | |
| "loss": 0.0137, | |
| "num_tokens": 23335798.0, | |
| "reward": 12.202452659606934, | |
| "reward_std": 18.661951065063477, | |
| "rewards/rollout_reward_func/mean": 12.20245361328125, | |
| "rewards/rollout_reward_func/std": 20.890966415405273, | |
| "sampling/importance_sampling_ratio/max": 1.5541430711746216, | |
| "sampling/importance_sampling_ratio/mean": 1.0242815017700195, | |
| "sampling/importance_sampling_ratio/min": 0.6801992058753967, | |
| "sampling/sampling_logp_difference/max": 0.38781797885894775, | |
| "sampling/sampling_logp_difference/mean": 0.006136234849691391, | |
| "step": 405, | |
| "step_time": 40.755317154002114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02380952425301075, | |
| "clip_ratio/high_mean": 0.00744047638727352, | |
| "clip_ratio/low_mean": 0.018129960633814335, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025570437079295516, | |
| "entropy": 0.12149734795093536, | |
| "epoch": 0.00812, | |
| "grad_norm": 0.26514580845832825, | |
| "kl": 0.64109767973423, | |
| "learning_rate": 9.999746485017087e-05, | |
| "loss": 0.0087, | |
| "step": 406, | |
| "step_time": 10.136832774996947 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0059523810632526875, | |
| "clip_ratio/high_mean": 0.0014880952658131719, | |
| "clip_ratio/low_mean": 0.002232142898719758, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0037202381645329297, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1554.0, | |
| "completions/max_terminated_length": 1554.0, | |
| "completions/mean_length": 1405.703125, | |
| "completions/mean_terminated_length": 1405.703125, | |
| "completions/min_length": 666.0, | |
| "completions/min_terminated_length": 666.0, | |
| "entropy": 0.12104977620765567, | |
| "epoch": 0.00814, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.35428720712661743, | |
| "kl": 0.6081782300025225, | |
| "learning_rate": 9.999745112829547e-05, | |
| "loss": 0.0047, | |
| "num_tokens": 23476941.0, | |
| "reward": 10.940488815307617, | |
| "reward_std": 14.940820693969727, | |
| "rewards/rollout_reward_func/mean": 10.940488815307617, | |
| "rewards/rollout_reward_func/std": 15.13664436340332, | |
| "sampling/importance_sampling_ratio/max": 1.254475712776184, | |
| "sampling/importance_sampling_ratio/mean": 0.9845165014266968, | |
| "sampling/importance_sampling_ratio/min": 0.6197980642318726, | |
| "sampling/sampling_logp_difference/max": 0.40376973152160645, | |
| "sampling/sampling_logp_difference/mean": 0.00637152511626482, | |
| "step": 407, | |
| "step_time": 40.52080072600438 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02380952425301075, | |
| "clip_ratio/high_mean": 0.007440476329065859, | |
| "clip_ratio/low_mean": 0.026450893783476204, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.033891370287165046, | |
| "entropy": 0.11284881783649325, | |
| "epoch": 0.00816, | |
| "grad_norm": 0.26076704263687134, | |
| "kl": 0.6312750466167927, | |
| "learning_rate": 9.999743736938565e-05, | |
| "loss": -0.0013, | |
| "step": 408, | |
| "step_time": 10.76028649699765 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0028409091755747795, | |
| "clip_ratio/high_mean": 0.0007102272938936949, | |
| "clip_ratio/low_mean": 0.002232142898719758, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0029423701926134527, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1546.0, | |
| "completions/max_terminated_length": 1546.0, | |
| "completions/mean_length": 1414.90625, | |
| "completions/mean_terminated_length": 1414.90625, | |
| "completions/min_length": 419.0, | |
| "completions/min_terminated_length": 419.0, | |
| "entropy": 0.11457140510901809, | |
| "epoch": 0.00818, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.668945848941803, | |
| "kl": 0.5923841055482626, | |
| "learning_rate": 9.999742357344142e-05, | |
| "loss": 0.0624, | |
| "num_tokens": 23618723.0, | |
| "reward": 10.537452697753906, | |
| "reward_std": 15.241682052612305, | |
| "rewards/rollout_reward_func/mean": 10.537453651428223, | |
| "rewards/rollout_reward_func/std": 16.505765914916992, | |
| "sampling/importance_sampling_ratio/max": 1.2935158014297485, | |
| "sampling/importance_sampling_ratio/mean": 0.9813590049743652, | |
| "sampling/importance_sampling_ratio/min": 2.974116992179171e-14, | |
| "sampling/sampling_logp_difference/max": 25.953086853027344, | |
| "sampling/sampling_logp_difference/mean": 0.028037957847118378, | |
| "step": 409, | |
| "step_time": 40.691261925003346 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04437229549512267, | |
| "clip_ratio/high_mean": 0.011837121448479593, | |
| "clip_ratio/low_mean": 0.016443452972453088, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.028280574886593968, | |
| "entropy": 0.1118474374525249, | |
| "epoch": 0.0082, | |
| "grad_norm": 0.2630373537540436, | |
| "kl": 0.6832827776670456, | |
| "learning_rate": 9.999740974046282e-05, | |
| "loss": 0.0566, | |
| "step": 410, | |
| "step_time": 9.717429660999187 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011904762126505375, | |
| "clip_ratio/high_mean": 0.004464285797439516, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334303461015, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1543.0, | |
| "completions/max_terminated_length": 1543.0, | |
| "completions/mean_length": 1393.609375, | |
| "completions/mean_terminated_length": 1393.609375, | |
| "completions/min_length": 298.0, | |
| "completions/min_terminated_length": 298.0, | |
| "entropy": 0.1146247279830277, | |
| "epoch": 0.00822, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.47182416915893555, | |
| "kl": 0.5627781376242638, | |
| "learning_rate": 9.999739587044981e-05, | |
| "loss": -0.0341, | |
| "num_tokens": 23759122.0, | |
| "reward": 8.971721649169922, | |
| "reward_std": 14.443693161010742, | |
| "rewards/rollout_reward_func/mean": 8.971721649169922, | |
| "rewards/rollout_reward_func/std": 14.68343448638916, | |
| "sampling/importance_sampling_ratio/max": 1.243363857269287, | |
| "sampling/importance_sampling_ratio/mean": 0.9929588437080383, | |
| "sampling/importance_sampling_ratio/min": 0.7046716809272766, | |
| "sampling/sampling_logp_difference/max": 0.35747838020324707, | |
| "sampling/sampling_logp_difference/mean": 0.005684119649231434, | |
| "step": 411, | |
| "step_time": 39.962890398002855 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.035714286379516125, | |
| "clip_ratio/high_mean": 0.009672619227785617, | |
| "clip_ratio/low_mean": 0.01767113123787567, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02734375116415322, | |
| "entropy": 0.11477407393977046, | |
| "epoch": 0.00824, | |
| "grad_norm": 0.24663475155830383, | |
| "kl": 0.6022106558084488, | |
| "learning_rate": 9.999738196340245e-05, | |
| "loss": -0.0386, | |
| "step": 412, | |
| "step_time": 9.870993509000982 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.002232142898719758, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0029761905316263437, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1563.0, | |
| "completions/max_terminated_length": 1563.0, | |
| "completions/mean_length": 1434.671875, | |
| "completions/mean_terminated_length": 1434.671875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.11288065044209361, | |
| "epoch": 0.00826, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.38144180178642273, | |
| "kl": 0.7383872698992491, | |
| "learning_rate": 9.999736801932072e-05, | |
| "loss": 0.0133, | |
| "num_tokens": 23902181.0, | |
| "reward": 13.304646492004395, | |
| "reward_std": 20.157991409301758, | |
| "rewards/rollout_reward_func/mean": 13.304647445678711, | |
| "rewards/rollout_reward_func/std": 21.064607620239258, | |
| "sampling/importance_sampling_ratio/max": 1.3603137731552124, | |
| "sampling/importance_sampling_ratio/mean": 1.0158387422561646, | |
| "sampling/importance_sampling_ratio/min": 0.7469893097877502, | |
| "sampling/sampling_logp_difference/max": 0.2501299977302551, | |
| "sampling/sampling_logp_difference/mean": 0.004556077066808939, | |
| "step": 413, | |
| "step_time": 41.25988831600807 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017857143422588706, | |
| "clip_ratio/high_mean": 0.0044642858556471765, | |
| "clip_ratio/low_mean": 0.012648809934034944, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01711309573147446, | |
| "entropy": 0.10936349909752607, | |
| "epoch": 0.00828, | |
| "grad_norm": 0.2556546628475189, | |
| "kl": 0.7252329587936401, | |
| "learning_rate": 9.999735403820466e-05, | |
| "loss": 0.0102, | |
| "step": 414, | |
| "step_time": 10.573283408997668 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0059523810632526875, | |
| "clip_ratio/high_mean": 0.0014880952658131719, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002232142898719758, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1562.0, | |
| "completions/max_terminated_length": 1562.0, | |
| "completions/mean_length": 1493.640625, | |
| "completions/mean_terminated_length": 1493.640625, | |
| "completions/min_length": 1359.0, | |
| "completions/min_terminated_length": 1359.0, | |
| "entropy": 0.11034470843151212, | |
| "epoch": 0.0083, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7242380380630493, | |
| "kl": 0.6212767362594604, | |
| "learning_rate": 9.999734002005428e-05, | |
| "loss": -0.0155, | |
| "num_tokens": 24049141.0, | |
| "reward": 9.928826332092285, | |
| "reward_std": 15.976888656616211, | |
| "rewards/rollout_reward_func/mean": 9.928826332092285, | |
| "rewards/rollout_reward_func/std": 16.414718627929688, | |
| "sampling/importance_sampling_ratio/max": 1.3260316848754883, | |
| "sampling/importance_sampling_ratio/mean": 1.0085797309875488, | |
| "sampling/importance_sampling_ratio/min": 0.5519727468490601, | |
| "sampling/sampling_logp_difference/max": 0.5959200859069824, | |
| "sampling/sampling_logp_difference/mean": 0.006388316862285137, | |
| "step": 415, | |
| "step_time": 40.88382640199961 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.014880952658131719, | |
| "clip_ratio/high_mean": 0.005952381121460348, | |
| "clip_ratio/low_mean": 0.019494048377964646, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025446429615840316, | |
| "entropy": 0.09857920417562127, | |
| "epoch": 0.00832, | |
| "grad_norm": 0.39599546790122986, | |
| "kl": 0.7278024889528751, | |
| "learning_rate": 9.99973259648696e-05, | |
| "loss": -0.013, | |
| "step": 416, | |
| "step_time": 10.850284384998304 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0059523810632526875, | |
| "clip_ratio/high_mean": 0.0014880952658131719, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002232142898719758, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1541.0, | |
| "completions/max_terminated_length": 1541.0, | |
| "completions/mean_length": 1388.96875, | |
| "completions/mean_terminated_length": 1388.96875, | |
| "completions/min_length": 188.0, | |
| "completions/min_terminated_length": 188.0, | |
| "entropy": 0.1026167522650212, | |
| "epoch": 0.00834, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4579165577888489, | |
| "kl": 0.8239834625273943, | |
| "learning_rate": 9.99973118726506e-05, | |
| "loss": -0.0484, | |
| "num_tokens": 24189178.0, | |
| "reward": 12.621437072753906, | |
| "reward_std": 16.67880630493164, | |
| "rewards/rollout_reward_func/mean": 12.621437072753906, | |
| "rewards/rollout_reward_func/std": 17.352924346923828, | |
| "sampling/importance_sampling_ratio/max": 1.2838397026062012, | |
| "sampling/importance_sampling_ratio/mean": 1.0156192779541016, | |
| "sampling/importance_sampling_ratio/min": 0.6750461459159851, | |
| "sampling/sampling_logp_difference/max": 0.2394113540649414, | |
| "sampling/sampling_logp_difference/mean": 0.004534607753157616, | |
| "step": 417, | |
| "step_time": 39.856104094997136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.038690477376803756, | |
| "clip_ratio/high_mean": 0.011904762184713036, | |
| "clip_ratio/low_mean": 0.01116071455180645, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02306547696935013, | |
| "entropy": 0.11136638512834907, | |
| "epoch": 0.00836, | |
| "grad_norm": 0.2160414755344391, | |
| "kl": 0.6315647587180138, | |
| "learning_rate": 9.999729774339733e-05, | |
| "loss": -0.0554, | |
| "step": 418, | |
| "step_time": 9.950184918994637 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0028409091755747795, | |
| "clip_ratio/high_mean": 0.0007102272938936949, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0007102272938936949, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1545.0, | |
| "completions/max_terminated_length": 1545.0, | |
| "completions/mean_length": 1415.34375, | |
| "completions/mean_terminated_length": 1415.34375, | |
| "completions/min_length": 741.0, | |
| "completions/min_terminated_length": 741.0, | |
| "entropy": 0.12300179339945316, | |
| "epoch": 0.00838, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.35927513241767883, | |
| "kl": 0.563910448923707, | |
| "learning_rate": 9.999728357710979e-05, | |
| "loss": -0.0024, | |
| "num_tokens": 24330939.0, | |
| "reward": 10.211483001708984, | |
| "reward_std": 12.243392944335938, | |
| "rewards/rollout_reward_func/mean": 10.211483001708984, | |
| "rewards/rollout_reward_func/std": 12.923269271850586, | |
| "sampling/importance_sampling_ratio/max": 1.561508297920227, | |
| "sampling/importance_sampling_ratio/mean": 0.9852752089500427, | |
| "sampling/importance_sampling_ratio/min": 0.6525661945343018, | |
| "sampling/sampling_logp_difference/max": 0.421316921710968, | |
| "sampling/sampling_logp_difference/mean": 0.005881062708795071, | |
| "step": 419, | |
| "step_time": 40.82010957399871 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023403680184856057, | |
| "clip_ratio/high_mean": 0.00801836303435266, | |
| "clip_ratio/low_mean": 0.005332341359462589, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01335070439381525, | |
| "entropy": 0.12413196917623281, | |
| "epoch": 0.0084, | |
| "grad_norm": 0.22808168828487396, | |
| "kl": 0.5641085561364889, | |
| "learning_rate": 9.999726937378799e-05, | |
| "loss": -0.0082, | |
| "step": 420, | |
| "step_time": 9.7226802879959 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008928571594879031, | |
| "clip_ratio/high_mean": 0.002232142898719758, | |
| "clip_ratio/low_mean": 0.002232142898719758, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004464285797439516, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1535.0, | |
| "completions/max_terminated_length": 1535.0, | |
| "completions/mean_length": 1441.640625, | |
| "completions/mean_terminated_length": 1441.640625, | |
| "completions/min_length": 864.0, | |
| "completions/min_terminated_length": 864.0, | |
| "entropy": 0.1284960494376719, | |
| "epoch": 0.00842, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.532646656036377, | |
| "kl": 0.7414810676127672, | |
| "learning_rate": 9.999725513343196e-05, | |
| "loss": 0.0034, | |
| "num_tokens": 24474440.0, | |
| "reward": 15.56411361694336, | |
| "reward_std": 16.717456817626953, | |
| "rewards/rollout_reward_func/mean": 15.56411361694336, | |
| "rewards/rollout_reward_func/std": 16.81290626525879, | |
| "sampling/importance_sampling_ratio/max": 1.2900909185409546, | |
| "sampling/importance_sampling_ratio/mean": 1.0089163780212402, | |
| "sampling/importance_sampling_ratio/min": 0.6302499175071716, | |
| "sampling/sampling_logp_difference/max": 0.41839098930358887, | |
| "sampling/sampling_logp_difference/mean": 0.006366787478327751, | |
| "step": 421, | |
| "step_time": 41.64962637100143 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020833333721384406, | |
| "clip_ratio/high_mean": 0.0052083334303461015, | |
| "clip_ratio/low_mean": 0.014136905199848115, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.019345238688401878, | |
| "entropy": 0.12450070818886161, | |
| "epoch": 0.00844, | |
| "grad_norm": 0.3132474422454834, | |
| "kl": 0.7047660015523434, | |
| "learning_rate": 9.999724085604169e-05, | |
| "loss": -0.0014, | |
| "step": 422, | |
| "step_time": 10.727001868001025 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.002232142898719758, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0029761905316263437, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1557.0, | |
| "completions/max_terminated_length": 1557.0, | |
| "completions/mean_length": 1452.453125, | |
| "completions/mean_terminated_length": 1452.453125, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "entropy": 0.11882536578923464, | |
| "epoch": 0.00846, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7803420424461365, | |
| "kl": 0.8879449907690287, | |
| "learning_rate": 9.999722654161722e-05, | |
| "loss": -0.0437, | |
| "num_tokens": 24618707.0, | |
| "reward": 11.537307739257812, | |
| "reward_std": 16.87006187438965, | |
| "rewards/rollout_reward_func/mean": 11.537307739257812, | |
| "rewards/rollout_reward_func/std": 18.111291885375977, | |
| "sampling/importance_sampling_ratio/max": 2.1790900230407715, | |
| "sampling/importance_sampling_ratio/mean": 1.0079734325408936, | |
| "sampling/importance_sampling_ratio/min": 0.6660839319229126, | |
| "sampling/sampling_logp_difference/max": 1.0955865383148193, | |
| "sampling/sampling_logp_difference/mean": 0.0059229484759271145, | |
| "step": 423, | |
| "step_time": 39.62021958500554 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02380952425301075, | |
| "clip_ratio/high_mean": 0.006696428696159273, | |
| "clip_ratio/low_mean": 0.015560741710942239, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02225717029068619, | |
| "entropy": 0.12485062563791871, | |
| "epoch": 0.00848, | |
| "grad_norm": 0.31361132860183716, | |
| "kl": 0.7454855944961309, | |
| "learning_rate": 9.999721219015854e-05, | |
| "loss": -0.0541, | |
| "step": 424, | |
| "step_time": 10.1194757400026 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0014880952658131719, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014880952658131719, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1537.0, | |
| "completions/max_terminated_length": 1537.0, | |
| "completions/mean_length": 1414.84375, | |
| "completions/mean_terminated_length": 1414.84375, | |
| "completions/min_length": 690.0, | |
| "completions/min_terminated_length": 690.0, | |
| "entropy": 0.1313102599233389, | |
| "epoch": 0.0085, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5788205862045288, | |
| "kl": 0.6796710211783648, | |
| "learning_rate": 9.999719780166567e-05, | |
| "loss": -0.0346, | |
| "num_tokens": 24760444.0, | |
| "reward": 10.583850860595703, | |
| "reward_std": 15.813437461853027, | |
| "rewards/rollout_reward_func/mean": 10.583850860595703, | |
| "rewards/rollout_reward_func/std": 15.782630920410156, | |
| "sampling/importance_sampling_ratio/max": 1.3160440921783447, | |
| "sampling/importance_sampling_ratio/mean": 0.9774030447006226, | |
| "sampling/importance_sampling_ratio/min": 0.7505905628204346, | |
| "sampling/sampling_logp_difference/max": 0.2754938304424286, | |
| "sampling/sampling_logp_difference/mean": 0.00678935507312417, | |
| "step": 425, | |
| "step_time": 41.97174792000442 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020833333721384406, | |
| "clip_ratio/high_mean": 0.0052083334303461015, | |
| "clip_ratio/low_mean": 0.017931548063643277, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023139881726820022, | |
| "entropy": 0.13454774813726544, | |
| "epoch": 0.00852, | |
| "grad_norm": 0.24161121249198914, | |
| "kl": 0.6602058243006468, | |
| "learning_rate": 9.999718337613865e-05, | |
| "loss": -0.0446, | |
| "step": 426, | |
| "step_time": 9.663861974999236 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008928571594879031, | |
| "clip_ratio/high_mean": 0.002232142898719758, | |
| "clip_ratio/low_mean": 0.0014880952658131719, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0037202381645329297, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1540.0, | |
| "completions/max_terminated_length": 1540.0, | |
| "completions/mean_length": 1432.515625, | |
| "completions/mean_terminated_length": 1432.515625, | |
| "completions/min_length": 408.0, | |
| "completions/min_terminated_length": 408.0, | |
| "entropy": 0.1410405244678259, | |
| "epoch": 0.00854, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5101809501647949, | |
| "kl": 0.6386174689978361, | |
| "learning_rate": 9.999716891357746e-05, | |
| "loss": 0.0369, | |
| "num_tokens": 24903364.0, | |
| "reward": 11.803701400756836, | |
| "reward_std": 16.973173141479492, | |
| "rewards/rollout_reward_func/mean": 11.803701400756836, | |
| "rewards/rollout_reward_func/std": 17.966468811035156, | |
| "sampling/importance_sampling_ratio/max": 1.7738006114959717, | |
| "sampling/importance_sampling_ratio/mean": 0.995194137096405, | |
| "sampling/importance_sampling_ratio/min": 0.6213434338569641, | |
| "sampling/sampling_logp_difference/max": 0.5084433555603027, | |
| "sampling/sampling_logp_difference/mean": 0.007640195079147816, | |
| "step": 427, | |
| "step_time": 42.486011768000026 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02976190554909408, | |
| "clip_ratio/high_mean": 0.01116071455180645, | |
| "clip_ratio/low_mean": 0.012369791802484542, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023530506470706314, | |
| "entropy": 0.14208506979048252, | |
| "epoch": 0.00856, | |
| "grad_norm": 0.2106575071811676, | |
| "kl": 0.6240573097020388, | |
| "learning_rate": 9.999715441398214e-05, | |
| "loss": 0.0308, | |
| "step": 428, | |
| "step_time": 10.646923483993305 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0007440476329065859, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1573.0, | |
| "completions/max_terminated_length": 1573.0, | |
| "completions/mean_length": 1438.828125, | |
| "completions/mean_terminated_length": 1438.828125, | |
| "completions/min_length": 1255.0, | |
| "completions/min_terminated_length": 1255.0, | |
| "entropy": 0.14296143036335707, | |
| "epoch": 0.00858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.42578843235969543, | |
| "kl": 0.5468210577964783, | |
| "learning_rate": 9.999713987735269e-05, | |
| "loss": 0.0008, | |
| "num_tokens": 25046668.0, | |
| "reward": 12.157367706298828, | |
| "reward_std": 19.82905387878418, | |
| "rewards/rollout_reward_func/mean": 12.157367706298828, | |
| "rewards/rollout_reward_func/std": 20.11625862121582, | |
| "sampling/importance_sampling_ratio/max": 1.1917697191238403, | |
| "sampling/importance_sampling_ratio/mean": 0.988789439201355, | |
| "sampling/importance_sampling_ratio/min": 0.6782960295677185, | |
| "sampling/sampling_logp_difference/max": 0.32637321949005127, | |
| "sampling/sampling_logp_difference/mean": 0.006113000214099884, | |
| "step": 429, | |
| "step_time": 40.68629942800362 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023809524485841393, | |
| "clip_ratio/high_mean": 0.007440476503688842, | |
| "clip_ratio/low_mean": 0.01045386923942715, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.017894345801323652, | |
| "entropy": 0.14418638544157147, | |
| "epoch": 0.0086, | |
| "grad_norm": 0.268960177898407, | |
| "kl": 0.5387851055711508, | |
| "learning_rate": 9.999712530368912e-05, | |
| "loss": -0.0055, | |
| "step": 430, | |
| "step_time": 11.072718907002127 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0007440476329065859, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1550.0, | |
| "completions/max_terminated_length": 1550.0, | |
| "completions/mean_length": 1429.546875, | |
| "completions/mean_terminated_length": 1429.546875, | |
| "completions/min_length": 195.0, | |
| "completions/min_terminated_length": 195.0, | |
| "entropy": 0.15179488621652126, | |
| "epoch": 0.00862, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7485275864601135, | |
| "kl": 0.5430763624608517, | |
| "learning_rate": 9.999711069299146e-05, | |
| "loss": -0.0808, | |
| "num_tokens": 25189448.0, | |
| "reward": 11.358131408691406, | |
| "reward_std": 17.856586456298828, | |
| "rewards/rollout_reward_func/mean": 11.358131408691406, | |
| "rewards/rollout_reward_func/std": 18.32318878173828, | |
| "sampling/importance_sampling_ratio/max": 1.3345392942428589, | |
| "sampling/importance_sampling_ratio/mean": 1.0265988111495972, | |
| "sampling/importance_sampling_ratio/min": 0.48013654351234436, | |
| "sampling/sampling_logp_difference/max": 0.7489854097366333, | |
| "sampling/sampling_logp_difference/mean": 0.008107547648251057, | |
| "step": 431, | |
| "step_time": 40.926112169998305 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0654761919286102, | |
| "clip_ratio/high_mean": 0.02008928614668548, | |
| "clip_ratio/low_mean": 0.017782738606911153, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.037872024811804295, | |
| "entropy": 0.15683973440900445, | |
| "epoch": 0.00864, | |
| "grad_norm": 0.2219485342502594, | |
| "kl": 0.5109246261417866, | |
| "learning_rate": 9.99970960452597e-05, | |
| "loss": -0.0914, | |
| "step": 432, | |
| "step_time": 10.182908312996005 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1552.0, | |
| "completions/max_terminated_length": 1552.0, | |
| "completions/mean_length": 1470.21875, | |
| "completions/mean_terminated_length": 1470.21875, | |
| "completions/min_length": 1344.0, | |
| "completions/min_terminated_length": 1344.0, | |
| "entropy": 0.14491091342642903, | |
| "epoch": 0.00866, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.46287354826927185, | |
| "kl": 0.5186197776347399, | |
| "learning_rate": 9.999708136049389e-05, | |
| "loss": -0.0113, | |
| "num_tokens": 25334849.0, | |
| "reward": 10.764678955078125, | |
| "reward_std": 13.417325973510742, | |
| "rewards/rollout_reward_func/mean": 10.764678955078125, | |
| "rewards/rollout_reward_func/std": 14.159459114074707, | |
| "sampling/importance_sampling_ratio/max": 1.4547772407531738, | |
| "sampling/importance_sampling_ratio/mean": 1.0081079006195068, | |
| "sampling/importance_sampling_ratio/min": 0.7049920558929443, | |
| "sampling/sampling_logp_difference/max": 0.4709939956665039, | |
| "sampling/sampling_logp_difference/mean": 0.005667536519467831, | |
| "step": 433, | |
| "step_time": 42.1604533589998 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0476190485060215, | |
| "clip_ratio/high_mean": 0.014136905199848115, | |
| "clip_ratio/low_mean": 0.01785714365541935, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03199404844781384, | |
| "entropy": 0.15482168877497315, | |
| "epoch": 0.00868, | |
| "grad_norm": 0.2332436740398407, | |
| "kl": 0.5066223796457052, | |
| "learning_rate": 9.9997066638694e-05, | |
| "loss": -0.0183, | |
| "step": 434, | |
| "step_time": 10.119833896998898 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1575.0, | |
| "completions/max_terminated_length": 1575.0, | |
| "completions/mean_length": 1436.375, | |
| "completions/mean_terminated_length": 1436.375, | |
| "completions/min_length": 795.0, | |
| "completions/min_terminated_length": 795.0, | |
| "entropy": 0.16640883032232523, | |
| "epoch": 0.0087, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5225183367729187, | |
| "kl": 0.4759600590914488, | |
| "learning_rate": 9.999705187986009e-05, | |
| "loss": 0.0044, | |
| "num_tokens": 25478062.0, | |
| "reward": 11.862446784973145, | |
| "reward_std": 14.980566024780273, | |
| "rewards/rollout_reward_func/mean": 11.862445831298828, | |
| "rewards/rollout_reward_func/std": 15.403722763061523, | |
| "sampling/importance_sampling_ratio/max": 1.3265814781188965, | |
| "sampling/importance_sampling_ratio/mean": 1.0067017078399658, | |
| "sampling/importance_sampling_ratio/min": 0.6984032988548279, | |
| "sampling/sampling_logp_difference/max": 0.3158724308013916, | |
| "sampling/sampling_logp_difference/mean": 0.007522557862102985, | |
| "step": 435, | |
| "step_time": 40.53673548099687 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08556547830812633, | |
| "clip_ratio/high_mean": 0.028087798331398517, | |
| "clip_ratio/low_mean": 0.025297619868069887, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05338541802484542, | |
| "entropy": 0.166918208822608, | |
| "epoch": 0.00872, | |
| "grad_norm": 0.5592331886291504, | |
| "kl": 0.46205065958201885, | |
| "learning_rate": 9.999703708399215e-05, | |
| "loss": -0.0001, | |
| "step": 436, | |
| "step_time": 10.790453629004332 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017857143189758062, | |
| "clip_ratio/high_mean": 0.004464285797439516, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334303461015, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1542.0, | |
| "completions/max_terminated_length": 1542.0, | |
| "completions/mean_length": 1430.21875, | |
| "completions/mean_terminated_length": 1430.21875, | |
| "completions/min_length": 1031.0, | |
| "completions/min_terminated_length": 1031.0, | |
| "entropy": 0.1512767318636179, | |
| "epoch": 0.00874, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5887247920036316, | |
| "kl": 0.47883218713104725, | |
| "learning_rate": 9.99970222510902e-05, | |
| "loss": 0.023, | |
| "num_tokens": 25620798.0, | |
| "reward": 10.20716667175293, | |
| "reward_std": 16.14691734313965, | |
| "rewards/rollout_reward_func/mean": 10.20716667175293, | |
| "rewards/rollout_reward_func/std": 17.900371551513672, | |
| "sampling/importance_sampling_ratio/max": 1.2416183948516846, | |
| "sampling/importance_sampling_ratio/mean": 0.9807419776916504, | |
| "sampling/importance_sampling_ratio/min": 0.542736291885376, | |
| "sampling/sampling_logp_difference/max": 0.36902284622192383, | |
| "sampling/sampling_logp_difference/mean": 0.0073195262812078, | |
| "step": 437, | |
| "step_time": 40.27251563699974 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.059523811331018806, | |
| "clip_ratio/high_mean": 0.02306547691114247, | |
| "clip_ratio/low_mean": 0.03698593232547864, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06005140976049006, | |
| "entropy": 0.11152059538289905, | |
| "epoch": 0.00876, | |
| "grad_norm": 0.34218233823776245, | |
| "kl": 0.599434606730938, | |
| "learning_rate": 9.999700738115424e-05, | |
| "loss": 0.0208, | |
| "step": 438, | |
| "step_time": 10.141322578992913 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008928571594879031, | |
| "clip_ratio/high_mean": 0.002232142898719758, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0029761905316263437, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1551.0, | |
| "completions/max_terminated_length": 1551.0, | |
| "completions/mean_length": 1448.5625, | |
| "completions/mean_terminated_length": 1448.5625, | |
| "completions/min_length": 1357.0, | |
| "completions/min_terminated_length": 1357.0, | |
| "entropy": 0.09335534879937768, | |
| "epoch": 0.00878, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5081444382667542, | |
| "kl": 0.5331121180206537, | |
| "learning_rate": 9.999699247418432e-05, | |
| "loss": -0.0063, | |
| "num_tokens": 25764758.0, | |
| "reward": 9.246360778808594, | |
| "reward_std": 12.59730339050293, | |
| "rewards/rollout_reward_func/mean": 9.246360778808594, | |
| "rewards/rollout_reward_func/std": 14.430070877075195, | |
| "sampling/importance_sampling_ratio/max": 1.47153902053833, | |
| "sampling/importance_sampling_ratio/mean": 0.9984990358352661, | |
| "sampling/importance_sampling_ratio/min": 0.582763671875, | |
| "sampling/sampling_logp_difference/max": 0.4040945768356323, | |
| "sampling/sampling_logp_difference/mean": 0.0051497891545295715, | |
| "step": 439, | |
| "step_time": 41.78158714499841 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020833333721384406, | |
| "clip_ratio/high_mean": 0.0052083334303461015, | |
| "clip_ratio/low_mean": 0.02083333401242271, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.026041667442768812, | |
| "entropy": 0.07112342561595142, | |
| "epoch": 0.0088, | |
| "grad_norm": 0.41764187812805176, | |
| "kl": 0.8426203690469265, | |
| "learning_rate": 9.999697753018041e-05, | |
| "loss": -0.0085, | |
| "step": 440, | |
| "step_time": 10.17990355800066 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008928571594879031, | |
| "clip_ratio/high_mean": 0.0029761905316263437, | |
| "clip_ratio/low_mean": 0.002232142898719758, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334303461015, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1552.0, | |
| "completions/max_terminated_length": 1552.0, | |
| "completions/mean_length": 1425.234375, | |
| "completions/mean_terminated_length": 1425.234375, | |
| "completions/min_length": 690.0, | |
| "completions/min_terminated_length": 690.0, | |
| "entropy": 0.07502732030116022, | |
| "epoch": 0.00882, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4730430841445923, | |
| "kl": 0.598696194589138, | |
| "learning_rate": 9.999696254914256e-05, | |
| "loss": -0.0232, | |
| "num_tokens": 25907211.0, | |
| "reward": 12.0460205078125, | |
| "reward_std": 12.864827156066895, | |
| "rewards/rollout_reward_func/mean": 12.0460205078125, | |
| "rewards/rollout_reward_func/std": 13.124265670776367, | |
| "sampling/importance_sampling_ratio/max": 2.117748975753784, | |
| "sampling/importance_sampling_ratio/mean": 0.979032039642334, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.2621982097625732, | |
| "sampling/sampling_logp_difference/mean": 0.006923416629433632, | |
| "step": 441, | |
| "step_time": 40.17172135200235 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020833333721384406, | |
| "clip_ratio/high_mean": 0.006696428754366934, | |
| "clip_ratio/low_mean": 0.010491071618162096, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01718750043073669, | |
| "entropy": 0.07272043719422072, | |
| "epoch": 0.00884, | |
| "grad_norm": 0.2536933422088623, | |
| "kl": 0.6081040930002928, | |
| "learning_rate": 9.999694753107076e-05, | |
| "loss": -0.0288, | |
| "step": 442, | |
| "step_time": 10.609227344999454 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0007440476329065859, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1551.0, | |
| "completions/max_terminated_length": 1551.0, | |
| "completions/mean_length": 1433.484375, | |
| "completions/mean_terminated_length": 1433.484375, | |
| "completions/min_length": 637.0, | |
| "completions/min_terminated_length": 637.0, | |
| "entropy": 0.08062643301673234, | |
| "epoch": 0.00886, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8294975757598877, | |
| "kl": 0.5611858777701855, | |
| "learning_rate": 9.999693247596505e-05, | |
| "loss": 0.0316, | |
| "num_tokens": 26050176.0, | |
| "reward": 9.822164535522461, | |
| "reward_std": 14.750000953674316, | |
| "rewards/rollout_reward_func/mean": 9.822165489196777, | |
| "rewards/rollout_reward_func/std": 14.6282377243042, | |
| "sampling/importance_sampling_ratio/max": 1.5190024375915527, | |
| "sampling/importance_sampling_ratio/mean": 1.0036146640777588, | |
| "sampling/importance_sampling_ratio/min": 0.7604562640190125, | |
| "sampling/sampling_logp_difference/max": 0.3031894564628601, | |
| "sampling/sampling_logp_difference/mean": 0.004106580279767513, | |
| "step": 443, | |
| "step_time": 40.79559905699534 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.030257937032729387, | |
| "clip_ratio/high_mean": 0.007564484258182347, | |
| "clip_ratio/low_mean": 0.015591179952025414, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02315566421020776, | |
| "entropy": 0.0824263768736273, | |
| "epoch": 0.00888, | |
| "grad_norm": 0.9322162866592407, | |
| "kl": 0.7285797223448753, | |
| "learning_rate": 9.999691738382544e-05, | |
| "loss": 0.034, | |
| "step": 444, | |
| "step_time": 10.72277228299754 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0007440476329065859, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1574.0, | |
| "completions/max_terminated_length": 1574.0, | |
| "completions/mean_length": 1428.421875, | |
| "completions/mean_terminated_length": 1428.421875, | |
| "completions/min_length": 1166.0, | |
| "completions/min_terminated_length": 1166.0, | |
| "entropy": 0.07391244731843472, | |
| "epoch": 0.0089, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5232189893722534, | |
| "kl": 0.6245864983648062, | |
| "learning_rate": 9.999690225465193e-05, | |
| "loss": -0.0215, | |
| "num_tokens": 26192780.0, | |
| "reward": 11.33067512512207, | |
| "reward_std": 15.117729187011719, | |
| "rewards/rollout_reward_func/mean": 11.33067512512207, | |
| "rewards/rollout_reward_func/std": 16.229934692382812, | |
| "sampling/importance_sampling_ratio/max": 1.4000767469406128, | |
| "sampling/importance_sampling_ratio/mean": 1.0228557586669922, | |
| "sampling/importance_sampling_ratio/min": 0.8148965239524841, | |
| "sampling/sampling_logp_difference/max": 0.303769588470459, | |
| "sampling/sampling_logp_difference/mean": 0.003096876898780465, | |
| "step": 445, | |
| "step_time": 39.69306251100352 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020833333721384406, | |
| "clip_ratio/high_mean": 0.0059523810632526875, | |
| "clip_ratio/low_mean": 0.011408730410039425, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.017361111531499773, | |
| "entropy": 0.07312626042403281, | |
| "epoch": 0.00892, | |
| "grad_norm": 0.2677549719810486, | |
| "kl": 0.6443136036396027, | |
| "learning_rate": 9.999688708844453e-05, | |
| "loss": -0.0254, | |
| "step": 446, | |
| "step_time": 9.859747254999093 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0064484127797186375, | |
| "clip_ratio/high_mean": 0.0023561508278362453, | |
| "clip_ratio/low_mean": 0.0014542749268002808, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003810425754636526, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1503.0, | |
| "completions/max_terminated_length": 1503.0, | |
| "completions/mean_length": 1392.1875, | |
| "completions/mean_terminated_length": 1392.1875, | |
| "completions/min_length": 1207.0, | |
| "completions/min_terminated_length": 1207.0, | |
| "entropy": 0.10388755868189037, | |
| "epoch": 0.00894, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.44962024688720703, | |
| "kl": 0.5700237862765789, | |
| "learning_rate": 9.999687188520327e-05, | |
| "loss": -0.0085, | |
| "num_tokens": 26333000.0, | |
| "reward": 10.396234512329102, | |
| "reward_std": 12.773336410522461, | |
| "rewards/rollout_reward_func/mean": 10.396234512329102, | |
| "rewards/rollout_reward_func/std": 13.91511058807373, | |
| "sampling/importance_sampling_ratio/max": 1.2538983821868896, | |
| "sampling/importance_sampling_ratio/mean": 1.0106072425842285, | |
| "sampling/importance_sampling_ratio/min": 0.8617662787437439, | |
| "sampling/sampling_logp_difference/max": 0.21103119850158691, | |
| "sampling/sampling_logp_difference/mean": 0.004161643795669079, | |
| "step": 447, | |
| "step_time": 40.40724292899722 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017215219675563276, | |
| "clip_ratio/high_mean": 0.005047852551797405, | |
| "clip_ratio/low_mean": 0.011870941845700145, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01691879451391287, | |
| "entropy": 0.10565289529040456, | |
| "epoch": 0.00896, | |
| "grad_norm": 0.3113742470741272, | |
| "kl": 0.5588793251663446, | |
| "learning_rate": 9.999685664492817e-05, | |
| "loss": -0.011, | |
| "step": 448, | |
| "step_time": 9.88399511400712 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0014880952658131719, | |
| "clip_ratio/low_mean": 0.0029761905316263437, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004464285797439516, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1539.0, | |
| "completions/max_terminated_length": 1539.0, | |
| "completions/mean_length": 1425.671875, | |
| "completions/mean_terminated_length": 1425.671875, | |
| "completions/min_length": 1252.0, | |
| "completions/min_terminated_length": 1252.0, | |
| "entropy": 0.08549337997101247, | |
| "epoch": 0.00898, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.43871885538101196, | |
| "kl": 0.5197541080415249, | |
| "learning_rate": 9.999684136761923e-05, | |
| "loss": 0.0424, | |
| "num_tokens": 26475423.0, | |
| "reward": 13.137186050415039, | |
| "reward_std": 18.040781021118164, | |
| "rewards/rollout_reward_func/mean": 13.137186050415039, | |
| "rewards/rollout_reward_func/std": 18.348669052124023, | |
| "sampling/importance_sampling_ratio/max": 2.0688071250915527, | |
| "sampling/importance_sampling_ratio/mean": 1.0355302095413208, | |
| "sampling/importance_sampling_ratio/min": 0.7141319513320923, | |
| "sampling/sampling_logp_difference/max": 0.7877845764160156, | |
| "sampling/sampling_logp_difference/mean": 0.004831024445593357, | |
| "step": 449, | |
| "step_time": 40.3097439919984 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02380952425301075, | |
| "clip_ratio/high_mean": 0.00744047638727352, | |
| "clip_ratio/low_mean": 0.010349026299081743, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01778950251173228, | |
| "entropy": 0.09008124680258334, | |
| "epoch": 0.009, | |
| "grad_norm": 0.2819499969482422, | |
| "kl": 0.48992327228188515, | |
| "learning_rate": 9.999682605327648e-05, | |
| "loss": 0.0377, | |
| "step": 450, | |
| "step_time": 11.019723427001736 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008928571594879031, | |
| "clip_ratio/high_mean": 0.002232142898719758, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0029761905316263437, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1561.0, | |
| "completions/max_terminated_length": 1561.0, | |
| "completions/mean_length": 1437.046875, | |
| "completions/mean_terminated_length": 1437.046875, | |
| "completions/min_length": 686.0, | |
| "completions/min_terminated_length": 686.0, | |
| "entropy": 0.09430601261556149, | |
| "epoch": 0.00902, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.39097362756729126, | |
| "kl": 0.5122922882437706, | |
| "learning_rate": 9.99968107018999e-05, | |
| "loss": -0.0447, | |
| "num_tokens": 26618636.0, | |
| "reward": 10.664965629577637, | |
| "reward_std": 12.413619995117188, | |
| "rewards/rollout_reward_func/mean": 10.664965629577637, | |
| "rewards/rollout_reward_func/std": 12.955881118774414, | |
| "sampling/importance_sampling_ratio/max": 1.1989917755126953, | |
| "sampling/importance_sampling_ratio/mean": 0.9830008745193481, | |
| "sampling/importance_sampling_ratio/min": 0.5060357451438904, | |
| "sampling/sampling_logp_difference/max": 0.3329579830169678, | |
| "sampling/sampling_logp_difference/mean": 0.004485957324504852, | |
| "step": 451, | |
| "step_time": 39.55284725899946 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017857143189758062, | |
| "clip_ratio/high_mean": 0.0052083334303461015, | |
| "clip_ratio/low_mean": 0.009709821664728224, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.014918155211489648, | |
| "entropy": 0.09947029640898108, | |
| "epoch": 0.00904, | |
| "grad_norm": 0.2647772431373596, | |
| "kl": 0.501507306471467, | |
| "learning_rate": 9.999679531348955e-05, | |
| "loss": -0.0474, | |
| "step": 452, | |
| "step_time": 9.83529582600022 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0029761905316263437, | |
| "clip_ratio/high_mean": 0.0007440476329065859, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014880952658131719, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1576.0, | |
| "completions/max_terminated_length": 1576.0, | |
| "completions/mean_length": 1483.796875, | |
| "completions/mean_terminated_length": 1483.796875, | |
| "completions/min_length": 1354.0, | |
| "completions/min_terminated_length": 1354.0, | |
| "entropy": 0.08577556139789522, | |
| "epoch": 0.00906, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4616855978965759, | |
| "kl": 0.4984573759138584, | |
| "learning_rate": 9.999677988804543e-05, | |
| "loss": 0.0129, | |
| "num_tokens": 26764995.0, | |
| "reward": 12.713988304138184, | |
| "reward_std": 16.157230377197266, | |
| "rewards/rollout_reward_func/mean": 12.713988304138184, | |
| "rewards/rollout_reward_func/std": 17.417678833007812, | |
| "sampling/importance_sampling_ratio/max": 1.2561296224594116, | |
| "sampling/importance_sampling_ratio/mean": 1.0040102005004883, | |
| "sampling/importance_sampling_ratio/min": 0.5851351618766785, | |
| "sampling/sampling_logp_difference/max": 0.335345983505249, | |
| "sampling/sampling_logp_difference/mean": 0.004758521914482117, | |
| "step": 453, | |
| "step_time": 42.06172357400101 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.014880952658131719, | |
| "clip_ratio/high_mean": 0.004464285797439516, | |
| "clip_ratio/low_mean": 0.015625000465661287, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.020089286321308464, | |
| "entropy": 0.0774516521487385, | |
| "epoch": 0.00908, | |
| "grad_norm": 0.13414981961250305, | |
| "kl": 0.5282110534608364, | |
| "learning_rate": 9.999676442556757e-05, | |
| "loss": 0.0065, | |
| "step": 454, | |
| "step_time": 10.263705699999264 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.018005952704697847, | |
| "clip_ratio/high_mean": 0.005245535809081048, | |
| "clip_ratio/low_mean": 0.002232142898719758, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007477678707800806, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1557.0, | |
| "completions/max_terminated_length": 1557.0, | |
| "completions/mean_length": 1432.234375, | |
| "completions/mean_terminated_length": 1432.234375, | |
| "completions/min_length": 777.0, | |
| "completions/min_terminated_length": 777.0, | |
| "entropy": 0.0869816429913044, | |
| "epoch": 0.0091, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5139105319976807, | |
| "kl": 0.5019301455467939, | |
| "learning_rate": 9.999674892605595e-05, | |
| "loss": -0.0143, | |
| "num_tokens": 26907877.0, | |
| "reward": 14.470987319946289, | |
| "reward_std": 12.551952362060547, | |
| "rewards/rollout_reward_func/mean": 14.470987319946289, | |
| "rewards/rollout_reward_func/std": 13.231359481811523, | |
| "sampling/importance_sampling_ratio/max": 1.4351580142974854, | |
| "sampling/importance_sampling_ratio/mean": 0.9842495918273926, | |
| "sampling/importance_sampling_ratio/min": 0.7047746181488037, | |
| "sampling/sampling_logp_difference/max": 0.36011219024658203, | |
| "sampling/sampling_logp_difference/mean": 0.005461296532303095, | |
| "step": 455, | |
| "step_time": 41.37734428300246 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.036011905409395695, | |
| "clip_ratio/high_mean": 0.012723214633297175, | |
| "clip_ratio/low_mean": 0.01116071455180645, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023883929243311286, | |
| "entropy": 0.07720525958575308, | |
| "epoch": 0.00912, | |
| "grad_norm": 0.3397330045700073, | |
| "kl": 0.6114528980106115, | |
| "learning_rate": 9.99967333895106e-05, | |
| "loss": -0.0171, | |
| "step": 456, | |
| "step_time": 10.617095338997387 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.009077381109818816, | |
| "clip_ratio/high_mean": 0.002269345277454704, | |
| "clip_ratio/low_mean": 0.0037202381645329297, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005989583441987634, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1556.0, | |
| "completions/max_terminated_length": 1556.0, | |
| "completions/mean_length": 1450.328125, | |
| "completions/mean_terminated_length": 1450.328125, | |
| "completions/min_length": 677.0, | |
| "completions/min_terminated_length": 677.0, | |
| "entropy": 0.08630289603024721, | |
| "epoch": 0.00914, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.915941059589386, | |
| "kl": 0.5304882265627384, | |
| "learning_rate": 9.999671781593154e-05, | |
| "loss": -0.0128, | |
| "num_tokens": 27051977.0, | |
| "reward": 12.1441650390625, | |
| "reward_std": 13.508443832397461, | |
| "rewards/rollout_reward_func/mean": 12.1441650390625, | |
| "rewards/rollout_reward_func/std": 14.862476348876953, | |
| "sampling/importance_sampling_ratio/max": 1.8943538665771484, | |
| "sampling/importance_sampling_ratio/mean": 1.0383222103118896, | |
| "sampling/importance_sampling_ratio/min": 0.6029430031776428, | |
| "sampling/sampling_logp_difference/max": 0.5262751579284668, | |
| "sampling/sampling_logp_difference/mean": 0.0063569676131010056, | |
| "step": 457, | |
| "step_time": 40.44359572299618 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.026785714784637094, | |
| "clip_ratio/high_mean": 0.00889475119765848, | |
| "clip_ratio/low_mean": 0.015625000349245965, | |
| "clip_ratio/low_min": 0.0029761905316263437, | |
| "clip_ratio/region_mean": 0.024519751546904445, | |
| "entropy": 0.07588907447643578, | |
| "epoch": 0.00916, | |
| "grad_norm": 0.3780209422111511, | |
| "kl": 0.5850545484572649, | |
| "learning_rate": 9.999670220531878e-05, | |
| "loss": -0.0142, | |
| "step": 458, | |
| "step_time": 10.81988593099959 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0059523810632526875, | |
| "clip_ratio/high_mean": 0.0014880952658131719, | |
| "clip_ratio/low_mean": 0.0007812500116415322, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002269345277454704, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1548.0, | |
| "completions/max_terminated_length": 1548.0, | |
| "completions/mean_length": 1446.8125, | |
| "completions/mean_terminated_length": 1446.8125, | |
| "completions/min_length": 708.0, | |
| "completions/min_terminated_length": 708.0, | |
| "entropy": 0.06712023681029677, | |
| "epoch": 0.00918, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5037431716918945, | |
| "kl": 0.5365529656410217, | |
| "learning_rate": 9.999668655767235e-05, | |
| "loss": -0.0142, | |
| "num_tokens": 27195924.0, | |
| "reward": 12.623528480529785, | |
| "reward_std": 16.375185012817383, | |
| "rewards/rollout_reward_func/mean": 12.623528480529785, | |
| "rewards/rollout_reward_func/std": 17.157840728759766, | |
| "sampling/importance_sampling_ratio/max": 1.4218283891677856, | |
| "sampling/importance_sampling_ratio/mean": 1.0120244026184082, | |
| "sampling/importance_sampling_ratio/min": 0.7264562249183655, | |
| "sampling/sampling_logp_difference/max": 0.36830270290374756, | |
| "sampling/sampling_logp_difference/mean": 0.003537567099556327, | |
| "step": 459, | |
| "step_time": 39.789260978999664 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020833333721384406, | |
| "clip_ratio/high_mean": 0.0052083334303461015, | |
| "clip_ratio/low_mean": 0.010230655025225133, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015438988397363573, | |
| "entropy": 0.061492747627198696, | |
| "epoch": 0.0092, | |
| "grad_norm": 0.27846819162368774, | |
| "kl": 0.6263625603169203, | |
| "learning_rate": 9.999667087299225e-05, | |
| "loss": -0.0179, | |
| "step": 460, | |
| "step_time": 10.157873148000363 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011904762126505375, | |
| "clip_ratio/high_mean": 0.0037202381645329297, | |
| "clip_ratio/low_mean": 0.0007440476329065859, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004464285797439516, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1553.0, | |
| "completions/max_terminated_length": 1553.0, | |
| "completions/mean_length": 1393.921875, | |
| "completions/mean_terminated_length": 1393.921875, | |
| "completions/min_length": 286.0, | |
| "completions/min_terminated_length": 286.0, | |
| "entropy": 0.06535098806489259, | |
| "epoch": 0.00922, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6925691366195679, | |
| "kl": 0.5965993329882622, | |
| "learning_rate": 9.99966551512785e-05, | |
| "loss": -0.0133, | |
| "num_tokens": 27336352.0, | |
| "reward": 8.150674819946289, | |
| "reward_std": 15.653514862060547, | |
| "rewards/rollout_reward_func/mean": 8.150674819946289, | |
| "rewards/rollout_reward_func/std": 16.096240997314453, | |
| "sampling/importance_sampling_ratio/max": 1.3934406042099, | |
| "sampling/importance_sampling_ratio/mean": 0.9711774587631226, | |
| "sampling/importance_sampling_ratio/min": 0.3346167504787445, | |
| "sampling/sampling_logp_difference/max": 1.0496406555175781, | |
| "sampling/sampling_logp_difference/mean": 0.005856034811586142, | |
| "step": 461, | |
| "step_time": 41.696121679995485 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023958333767950535, | |
| "clip_ratio/high_mean": 0.00673363107489422, | |
| "clip_ratio/low_mean": 0.015252976503688842, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021986607811413705, | |
| "entropy": 0.0650356519035995, | |
| "epoch": 0.00924, | |
| "grad_norm": 1.2314876317977905, | |
| "kl": 1.7299257963895798, | |
| "learning_rate": 9.999663939253112e-05, | |
| "loss": -0.0022, | |
| "step": 462, | |
| "step_time": 10.117755536000914 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.009077381109818816, | |
| "clip_ratio/high_mean": 0.002269345277454704, | |
| "clip_ratio/low_mean": 0.0014880952658131719, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003757440543267876, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1543.0, | |
| "completions/max_terminated_length": 1543.0, | |
| "completions/mean_length": 1386.734375, | |
| "completions/mean_terminated_length": 1386.734375, | |
| "completions/min_length": 195.0, | |
| "completions/min_terminated_length": 195.0, | |
| "entropy": 0.08255739836022258, | |
| "epoch": 0.00926, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4534408450126648, | |
| "kl": 0.5308241080492735, | |
| "learning_rate": 9.999662359675012e-05, | |
| "loss": -0.0123, | |
| "num_tokens": 27476234.0, | |
| "reward": 11.247259140014648, | |
| "reward_std": 14.853042602539062, | |
| "rewards/rollout_reward_func/mean": 11.247259140014648, | |
| "rewards/rollout_reward_func/std": 14.736608505249023, | |
| "sampling/importance_sampling_ratio/max": 1.3197416067123413, | |
| "sampling/importance_sampling_ratio/mean": 0.9946113228797913, | |
| "sampling/importance_sampling_ratio/min": 0.7106093764305115, | |
| "sampling/sampling_logp_difference/max": 0.3446381092071533, | |
| "sampling/sampling_logp_difference/mean": 0.00609009200707078, | |
| "step": 463, | |
| "step_time": 40.418589189997874 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.026934524532407522, | |
| "clip_ratio/high_mean": 0.009709821664728224, | |
| "clip_ratio/low_mean": 0.01640625053551048, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.026116072200238705, | |
| "entropy": 0.09197275433689356, | |
| "epoch": 0.00928, | |
| "grad_norm": 0.46926549077033997, | |
| "kl": 0.5417319964617491, | |
| "learning_rate": 9.999660776393552e-05, | |
| "loss": -0.0111, | |
| "step": 464, | |
| "step_time": 10.364104637999844 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0059523810632526875, | |
| "clip_ratio/high_mean": 0.0014880952658131719, | |
| "clip_ratio/low_mean": 0.0014880952658131719, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0029761905316263437, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1538.0, | |
| "completions/max_terminated_length": 1538.0, | |
| "completions/mean_length": 1408.109375, | |
| "completions/mean_terminated_length": 1408.109375, | |
| "completions/min_length": 788.0, | |
| "completions/min_terminated_length": 788.0, | |
| "entropy": 0.0891355937346816, | |
| "epoch": 0.0093, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9044552445411682, | |
| "kl": 0.6998987477272749, | |
| "learning_rate": 9.999659189408731e-05, | |
| "loss": -0.0085, | |
| "num_tokens": 27617505.0, | |
| "reward": 13.215154647827148, | |
| "reward_std": 11.782221794128418, | |
| "rewards/rollout_reward_func/mean": 13.215155601501465, | |
| "rewards/rollout_reward_func/std": 12.105838775634766, | |
| "sampling/importance_sampling_ratio/max": 1.657700777053833, | |
| "sampling/importance_sampling_ratio/mean": 1.0080327987670898, | |
| "sampling/importance_sampling_ratio/min": 0.5086445808410645, | |
| "sampling/sampling_logp_difference/max": 0.6424019932746887, | |
| "sampling/sampling_logp_difference/mean": 0.005240763537585735, | |
| "step": 465, | |
| "step_time": 39.89362095000433 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 100000, | |
| "num_input_tokens_seen": 27617505, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |