Instructions to use Gege24/gin_rummy_4G with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Gege24/gin_rummy_4G with PEFT:
Base model is not found.
- Transformers
How to use Gege24/gin_rummy_4G with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Gege24/gin_rummy_4G")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Gege24/gin_rummy_4G", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Gege24/gin_rummy_4G with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Gege24/gin_rummy_4G"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/gin_rummy_4G",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/Gege24/gin_rummy_4G
- SGLang
How to use Gege24/gin_rummy_4G with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Gege24/gin_rummy_4G" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/gin_rummy_4G",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Gege24/gin_rummy_4G" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/gin_rummy_4G",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use Gege24/gin_rummy_4G with Docker Model Runner:
docker model run hf.co/Gege24/gin_rummy_4G
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.006, | |
| "eval_steps": 500, | |
| "global_step": 75, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2783.0, | |
| "completions/max_terminated_length": 2783.0, | |
| "completions/mean_length": 2052.75, | |
| "completions/mean_terminated_length": 2052.75, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.4163087382912636, | |
| "epoch": 8e-05, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.8663769960403442, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.0386, | |
| "num_tokens": 78630.0, | |
| "reward": 0.46406251192092896, | |
| "reward_std": 0.20054946839809418, | |
| "rewards/rollout_reward_func/mean": 0.46406251192092896, | |
| "rewards/rollout_reward_func/std": 0.37604784965515137, | |
| "sampling/importance_sampling_ratio/max": 2.1498024463653564, | |
| "sampling/importance_sampling_ratio/mean": 1.0975958108901978, | |
| "sampling/importance_sampling_ratio/min": 0.241215318441391, | |
| "sampling/sampling_logp_difference/max": 0.7405228614807129, | |
| "sampling/sampling_logp_difference/mean": 0.039819031953811646, | |
| "step": 1, | |
| "step_time": 14.418279634999976 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2798.0, | |
| "completions/max_terminated_length": 2798.0, | |
| "completions/mean_length": 2084.21875, | |
| "completions/mean_terminated_length": 2084.21875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.3995310440659523, | |
| "epoch": 0.00016, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.142817735671997, | |
| "kl": 0.0, | |
| "learning_rate": 1.7142857142857143e-07, | |
| "loss": 0.016, | |
| "num_tokens": 158194.0, | |
| "reward": 0.3384375274181366, | |
| "reward_std": 0.16842570900917053, | |
| "rewards/rollout_reward_func/mean": 0.3384375274181366, | |
| "rewards/rollout_reward_func/std": 0.27340278029441833, | |
| "sampling/importance_sampling_ratio/max": 1.9602876901626587, | |
| "sampling/importance_sampling_ratio/mean": 0.992855966091156, | |
| "sampling/importance_sampling_ratio/min": 0.46628525853157043, | |
| "sampling/sampling_logp_difference/max": 0.6929764747619629, | |
| "sampling/sampling_logp_difference/mean": 0.04201715067028999, | |
| "step": 2, | |
| "step_time": 13.260429134000105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04315628902986646, | |
| "clip_ratio/high_mean": 0.012242560740560293, | |
| "clip_ratio/low_mean": 0.011964043835178018, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.024206604342907667, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2776.0, | |
| "completions/max_terminated_length": 2776.0, | |
| "completions/mean_length": 1875.09375, | |
| "completions/mean_terminated_length": 1875.09375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.38934508711099625, | |
| "epoch": 0.00024, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.2311601638793945, | |
| "kl": 0.003617420152295381, | |
| "learning_rate": 3.4285714285714286e-07, | |
| "loss": -0.0954, | |
| "num_tokens": 230320.0, | |
| "reward": 0.4612500071525574, | |
| "reward_std": 0.22380851209163666, | |
| "rewards/rollout_reward_func/mean": 0.4612500071525574, | |
| "rewards/rollout_reward_func/std": 0.3984546363353729, | |
| "sampling/importance_sampling_ratio/max": 1.6067352294921875, | |
| "sampling/importance_sampling_ratio/mean": 0.9242645502090454, | |
| "sampling/importance_sampling_ratio/min": 0.17279618978500366, | |
| "sampling/sampling_logp_difference/max": 1.4119317531585693, | |
| "sampling/sampling_logp_difference/mean": 0.045969706028699875, | |
| "step": 3, | |
| "step_time": 12.304947445999915 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023281023371964693, | |
| "clip_ratio/high_mean": 0.012716594734229147, | |
| "clip_ratio/low_mean": 0.01039634458720684, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023112939670681953, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2783.0, | |
| "completions/max_terminated_length": 2783.0, | |
| "completions/mean_length": 2251.84375, | |
| "completions/mean_terminated_length": 2251.84375, | |
| "completions/min_length": 1562.0, | |
| "completions/min_terminated_length": 1562.0, | |
| "entropy": 0.4188930094242096, | |
| "epoch": 0.00032, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.3432626724243164, | |
| "kl": 0.005323103512637317, | |
| "learning_rate": 5.142857142857143e-07, | |
| "loss": -0.1037, | |
| "num_tokens": 315875.0, | |
| "reward": 0.2640625238418579, | |
| "reward_std": 0.07438889145851135, | |
| "rewards/rollout_reward_func/mean": 0.2640625238418579, | |
| "rewards/rollout_reward_func/std": 0.09810657054185867, | |
| "sampling/importance_sampling_ratio/max": 2.92923903465271, | |
| "sampling/importance_sampling_ratio/mean": 1.0071074962615967, | |
| "sampling/importance_sampling_ratio/min": 0.30356213450431824, | |
| "sampling/sampling_logp_difference/max": 0.9253432750701904, | |
| "sampling/sampling_logp_difference/mean": 0.04933081567287445, | |
| "step": 4, | |
| "step_time": 13.287707804000092 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04239537985995412, | |
| "clip_ratio/high_mean": 0.018673060229048133, | |
| "clip_ratio/low_mean": 0.0042297979816794395, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.022902858443558216, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2790.0, | |
| "completions/max_terminated_length": 2790.0, | |
| "completions/mean_length": 2197.3125, | |
| "completions/mean_terminated_length": 2197.3125, | |
| "completions/min_length": 1559.0, | |
| "completions/min_terminated_length": 1559.0, | |
| "entropy": 0.4414307102560997, | |
| "epoch": 0.0004, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.470518112182617, | |
| "kl": 0.004553150560241193, | |
| "learning_rate": 6.857142857142857e-07, | |
| "loss": 0.1372, | |
| "num_tokens": 399370.0, | |
| "reward": 0.40281248092651367, | |
| "reward_std": 0.16662904620170593, | |
| "rewards/rollout_reward_func/mean": 0.40281248092651367, | |
| "rewards/rollout_reward_func/std": 0.3357921242713928, | |
| "sampling/importance_sampling_ratio/max": 2.2576870918273926, | |
| "sampling/importance_sampling_ratio/mean": 1.0002690553665161, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.462371826171875, | |
| "sampling/sampling_logp_difference/mean": 0.053694289177656174, | |
| "step": 5, | |
| "step_time": 13.068715858000132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02923969691619277, | |
| "clip_ratio/high_mean": 0.01021690119523555, | |
| "clip_ratio/low_mean": 0.01101089478470385, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021227796096354723, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2428.0, | |
| "completions/max_terminated_length": 2428.0, | |
| "completions/mean_length": 1826.09375, | |
| "completions/mean_terminated_length": 1826.09375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.37763065844774246, | |
| "epoch": 0.00048, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.4737789630889893, | |
| "kl": 0.003610707528423518, | |
| "learning_rate": 8.571428571428571e-07, | |
| "loss": 0.0212, | |
| "num_tokens": 469858.0, | |
| "reward": 0.4584375023841858, | |
| "reward_std": 0.2892817258834839, | |
| "rewards/rollout_reward_func/mean": 0.4584375023841858, | |
| "rewards/rollout_reward_func/std": 0.4035496413707733, | |
| "sampling/importance_sampling_ratio/max": 1.8672934770584106, | |
| "sampling/importance_sampling_ratio/mean": 0.9250987768173218, | |
| "sampling/importance_sampling_ratio/min": 0.2111542820930481, | |
| "sampling/sampling_logp_difference/max": 1.105020523071289, | |
| "sampling/sampling_logp_difference/mean": 0.04392547905445099, | |
| "step": 6, | |
| "step_time": 11.808095918000163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02163859363645315, | |
| "clip_ratio/high_mean": 0.007195362821221352, | |
| "clip_ratio/low_mean": 0.009288194705732167, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01648355764336884, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2799.0, | |
| "completions/max_terminated_length": 2799.0, | |
| "completions/mean_length": 2101.9375, | |
| "completions/mean_terminated_length": 2101.9375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.4031049609184265, | |
| "epoch": 0.00056, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 2.241011142730713, | |
| "kl": 0.004900285159237683, | |
| "learning_rate": 1.0285714285714286e-06, | |
| "loss": 0.0307, | |
| "num_tokens": 549695.0, | |
| "reward": 0.32218751311302185, | |
| "reward_std": 0.10592572391033173, | |
| "rewards/rollout_reward_func/mean": 0.32218751311302185, | |
| "rewards/rollout_reward_func/std": 0.22224271297454834, | |
| "sampling/importance_sampling_ratio/max": 2.7520875930786133, | |
| "sampling/importance_sampling_ratio/mean": 0.9687752723693848, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.8389774560928345, | |
| "sampling/sampling_logp_difference/mean": 0.043909620493650436, | |
| "step": 7, | |
| "step_time": 13.085814365000147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029183519072830677, | |
| "clip_ratio/high_mean": 0.008625667076557875, | |
| "clip_ratio/low_mean": 0.016130636679008603, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02475630398839712, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2794.0, | |
| "completions/max_terminated_length": 2794.0, | |
| "completions/mean_length": 2250.71875, | |
| "completions/mean_terminated_length": 2250.71875, | |
| "completions/min_length": 1570.0, | |
| "completions/min_terminated_length": 1570.0, | |
| "entropy": 0.43513813614845276, | |
| "epoch": 0.00064, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.6571757793426514, | |
| "kl": 0.0038885354879312217, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": -0.0896, | |
| "num_tokens": 634822.0, | |
| "reward": 0.30375000834465027, | |
| "reward_std": 0.11063194274902344, | |
| "rewards/rollout_reward_func/mean": 0.30375000834465027, | |
| "rewards/rollout_reward_func/std": 0.22577106952667236, | |
| "sampling/importance_sampling_ratio/max": 2.24173641204834, | |
| "sampling/importance_sampling_ratio/mean": 0.9777867794036865, | |
| "sampling/importance_sampling_ratio/min": 0.4010058343410492, | |
| "sampling/sampling_logp_difference/max": 0.9179394245147705, | |
| "sampling/sampling_logp_difference/mean": 0.0499531514942646, | |
| "step": 8, | |
| "step_time": 13.055169309000007 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.014742525294423103, | |
| "clip_ratio/high_mean": 0.003685631323605776, | |
| "clip_ratio/low_mean": 0.008176195668056607, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.011861827224493027, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2810.0, | |
| "completions/max_terminated_length": 2810.0, | |
| "completions/mean_length": 1657.03125, | |
| "completions/mean_terminated_length": 1657.03125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.39970337599515915, | |
| "epoch": 0.00072, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.901354432106018, | |
| "kl": 0.005712996702641249, | |
| "learning_rate": 1.3714285714285715e-06, | |
| "loss": 0.0004, | |
| "num_tokens": 699616.0, | |
| "reward": 0.3528124690055847, | |
| "reward_std": 0.20240315794944763, | |
| "rewards/rollout_reward_func/mean": 0.3528124690055847, | |
| "rewards/rollout_reward_func/std": 0.3597510755062103, | |
| "sampling/importance_sampling_ratio/max": 2.387613296508789, | |
| "sampling/importance_sampling_ratio/mean": 1.0771517753601074, | |
| "sampling/importance_sampling_ratio/min": 0.5435174703598022, | |
| "sampling/sampling_logp_difference/max": 0.6833771467208862, | |
| "sampling/sampling_logp_difference/mean": 0.04181923717260361, | |
| "step": 9, | |
| "step_time": 13.294292020000057 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03906210558488965, | |
| "clip_ratio/high_mean": 0.015391179244033992, | |
| "clip_ratio/low_mean": 0.0073633925057947636, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.022754571866244078, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2793.0, | |
| "completions/max_terminated_length": 2793.0, | |
| "completions/mean_length": 2214.53125, | |
| "completions/mean_terminated_length": 2214.53125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.41637370735406876, | |
| "epoch": 0.0008, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.541189670562744, | |
| "kl": 0.004748326842673123, | |
| "learning_rate": 1.5428571428571428e-06, | |
| "loss": -0.0168, | |
| "num_tokens": 783707.0, | |
| "reward": 0.4284375309944153, | |
| "reward_std": 0.1260128915309906, | |
| "rewards/rollout_reward_func/mean": 0.4284375309944153, | |
| "rewards/rollout_reward_func/std": 0.3622608780860901, | |
| "sampling/importance_sampling_ratio/max": 2.2261769771575928, | |
| "sampling/importance_sampling_ratio/mean": 1.042180061340332, | |
| "sampling/importance_sampling_ratio/min": 0.2320551723241806, | |
| "sampling/sampling_logp_difference/max": 1.021528959274292, | |
| "sampling/sampling_logp_difference/mean": 0.04730905592441559, | |
| "step": 10, | |
| "step_time": 13.637110585000073 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010990338400006294, | |
| "clip_ratio/high_mean": 0.0027475846000015736, | |
| "clip_ratio/low_mean": 0.0016025641234591603, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004350148723460734, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2793.0, | |
| "completions/max_terminated_length": 2793.0, | |
| "completions/mean_length": 1966.4375, | |
| "completions/mean_terminated_length": 1966.4375, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.4171219617128372, | |
| "epoch": 0.00088, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.4207873344421387, | |
| "kl": 0.0035471616429276764, | |
| "learning_rate": 1.7142857142857143e-06, | |
| "loss": 0.0422, | |
| "num_tokens": 858994.0, | |
| "reward": 0.4596875309944153, | |
| "reward_std": 0.14279377460479736, | |
| "rewards/rollout_reward_func/mean": 0.4596875309944153, | |
| "rewards/rollout_reward_func/std": 0.3725802004337311, | |
| "sampling/importance_sampling_ratio/max": 1.874053716659546, | |
| "sampling/importance_sampling_ratio/mean": 0.9027889966964722, | |
| "sampling/importance_sampling_ratio/min": 0.45684853196144104, | |
| "sampling/sampling_logp_difference/max": 0.5253086090087891, | |
| "sampling/sampling_logp_difference/mean": 0.0447448305785656, | |
| "step": 11, | |
| "step_time": 12.90678849699998 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.039288708940148354, | |
| "clip_ratio/high_mean": 0.017087680520489812, | |
| "clip_ratio/low_mean": 0.008439590455964208, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02552727097645402, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2782.0, | |
| "completions/max_terminated_length": 2782.0, | |
| "completions/mean_length": 1859.5625, | |
| "completions/mean_terminated_length": 1859.5625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.42384539544582367, | |
| "epoch": 0.00096, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 2.1793887615203857, | |
| "kl": 0.004719441349152476, | |
| "learning_rate": 1.8857142857142858e-06, | |
| "loss": -0.0501, | |
| "num_tokens": 930647.0, | |
| "reward": 0.5653125047683716, | |
| "reward_std": 0.09132834523916245, | |
| "rewards/rollout_reward_func/mean": 0.5653125047683716, | |
| "rewards/rollout_reward_func/std": 0.4122869372367859, | |
| "sampling/importance_sampling_ratio/max": 1.968488097190857, | |
| "sampling/importance_sampling_ratio/mean": 1.1238960027694702, | |
| "sampling/importance_sampling_ratio/min": 0.5891481637954712, | |
| "sampling/sampling_logp_difference/max": 0.9189000129699707, | |
| "sampling/sampling_logp_difference/mean": 0.045362215489149094, | |
| "step": 12, | |
| "step_time": 12.105040577999944 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03163956617936492, | |
| "clip_ratio/high_mean": 0.009185401839204133, | |
| "clip_ratio/low_mean": 0.01267810445278883, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021863506408408284, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2794.0, | |
| "completions/max_terminated_length": 2794.0, | |
| "completions/mean_length": 1744.0625, | |
| "completions/mean_terminated_length": 1744.0625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.38077671080827713, | |
| "epoch": 0.00104, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.601933240890503, | |
| "kl": 0.0043588640401139855, | |
| "learning_rate": 2.0571428571428573e-06, | |
| "loss": -0.0271, | |
| "num_tokens": 998268.0, | |
| "reward": 0.5040624737739563, | |
| "reward_std": 0.3648141622543335, | |
| "rewards/rollout_reward_func/mean": 0.5040624737739563, | |
| "rewards/rollout_reward_func/std": 0.4420104920864105, | |
| "sampling/importance_sampling_ratio/max": 2.2825241088867188, | |
| "sampling/importance_sampling_ratio/mean": 1.0028969049453735, | |
| "sampling/importance_sampling_ratio/min": 0.37051475048065186, | |
| "sampling/sampling_logp_difference/max": 0.6929263472557068, | |
| "sampling/sampling_logp_difference/mean": 0.043037254363298416, | |
| "step": 13, | |
| "step_time": 12.434848872999964 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0682385629042983, | |
| "clip_ratio/high_mean": 0.022985405288636684, | |
| "clip_ratio/low_mean": 0.0055555556900799274, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.028540961910039186, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2797.0, | |
| "completions/max_terminated_length": 2797.0, | |
| "completions/mean_length": 2003.40625, | |
| "completions/mean_terminated_length": 2003.40625, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.40549617260694504, | |
| "epoch": 0.00112, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 2.6730706691741943, | |
| "kl": 0.004465080099180341, | |
| "learning_rate": 2.2285714285714286e-06, | |
| "loss": 0.0367, | |
| "num_tokens": 1075200.0, | |
| "reward": 0.3971875011920929, | |
| "reward_std": 0.24656714498996735, | |
| "rewards/rollout_reward_func/mean": 0.3971875011920929, | |
| "rewards/rollout_reward_func/std": 0.3921506702899933, | |
| "sampling/importance_sampling_ratio/max": 2.08994197845459, | |
| "sampling/importance_sampling_ratio/mean": 0.9472236037254333, | |
| "sampling/importance_sampling_ratio/min": 0.2920815646648407, | |
| "sampling/sampling_logp_difference/max": 0.5747750997543335, | |
| "sampling/sampling_logp_difference/mean": 0.0427585169672966, | |
| "step": 14, | |
| "step_time": 13.12732943400033 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04484127042815089, | |
| "clip_ratio/high_mean": 0.01285505446139723, | |
| "clip_ratio/low_mean": 0.008134920848533511, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.020989975426346064, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2801.0, | |
| "completions/max_terminated_length": 2801.0, | |
| "completions/mean_length": 1879.96875, | |
| "completions/mean_terminated_length": 1879.96875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.3704817444086075, | |
| "epoch": 0.0012, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.3741340637207031, | |
| "kl": 0.004930144699756056, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "loss": -0.036, | |
| "num_tokens": 1147654.0, | |
| "reward": 0.5634374618530273, | |
| "reward_std": 0.2032102644443512, | |
| "rewards/rollout_reward_func/mean": 0.5634374618530273, | |
| "rewards/rollout_reward_func/std": 0.4479026794433594, | |
| "sampling/importance_sampling_ratio/max": 1.419919490814209, | |
| "sampling/importance_sampling_ratio/mean": 0.8213506937026978, | |
| "sampling/importance_sampling_ratio/min": 0.2297196239233017, | |
| "sampling/sampling_logp_difference/max": 0.9635820388793945, | |
| "sampling/sampling_logp_difference/mean": 0.04450097680091858, | |
| "step": 15, | |
| "step_time": 12.787183091000088 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.040403091348707676, | |
| "clip_ratio/high_mean": 0.019732415094040334, | |
| "clip_ratio/low_mean": 0.011093285749666393, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.030825700610876083, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2812.0, | |
| "completions/max_terminated_length": 2812.0, | |
| "completions/mean_length": 2226.75, | |
| "completions/mean_terminated_length": 2226.75, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.4390428438782692, | |
| "epoch": 0.00128, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 3.3430113792419434, | |
| "kl": 0.005434123100712895, | |
| "learning_rate": 2.571428571428571e-06, | |
| "loss": 0.0608, | |
| "num_tokens": 1232479.0, | |
| "reward": 0.35874998569488525, | |
| "reward_std": 0.16885429620742798, | |
| "rewards/rollout_reward_func/mean": 0.35874998569488525, | |
| "rewards/rollout_reward_func/std": 0.31368517875671387, | |
| "sampling/importance_sampling_ratio/max": 2.1880178451538086, | |
| "sampling/importance_sampling_ratio/mean": 0.9618589878082275, | |
| "sampling/importance_sampling_ratio/min": 0.12961336970329285, | |
| "sampling/sampling_logp_difference/max": 0.941362738609314, | |
| "sampling/sampling_logp_difference/mean": 0.05188923329114914, | |
| "step": 16, | |
| "step_time": 13.173405885999728 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04390919208526611, | |
| "clip_ratio/high_mean": 0.017439239425584674, | |
| "clip_ratio/low_mean": 0.007801226573064923, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025240465998649597, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2434.0, | |
| "completions/max_terminated_length": 2434.0, | |
| "completions/mean_length": 1750.0625, | |
| "completions/mean_terminated_length": 1750.0625, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.3980557546019554, | |
| "epoch": 0.00136, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.602077007293701, | |
| "kl": 0.004334585275501013, | |
| "learning_rate": 2.742857142857143e-06, | |
| "loss": -0.0264, | |
| "num_tokens": 1300635.0, | |
| "reward": 0.38499999046325684, | |
| "reward_std": 0.221183180809021, | |
| "rewards/rollout_reward_func/mean": 0.38499999046325684, | |
| "rewards/rollout_reward_func/std": 0.34839722514152527, | |
| "sampling/importance_sampling_ratio/max": 1.7235372066497803, | |
| "sampling/importance_sampling_ratio/mean": 0.9467421770095825, | |
| "sampling/importance_sampling_ratio/min": 0.2654297649860382, | |
| "sampling/sampling_logp_difference/max": 0.7773740887641907, | |
| "sampling/sampling_logp_difference/mean": 0.04712219163775444, | |
| "step": 17, | |
| "step_time": 11.411276259999795 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04594441968947649, | |
| "clip_ratio/high_mean": 0.013718247646465898, | |
| "clip_ratio/low_mean": 0.004949534311890602, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.018667781492695212, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2798.0, | |
| "completions/max_terminated_length": 2798.0, | |
| "completions/mean_length": 2309.09375, | |
| "completions/mean_terminated_length": 2309.09375, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.4307108670473099, | |
| "epoch": 0.00144, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.9505234956741333, | |
| "kl": 0.004669323505368084, | |
| "learning_rate": 2.9142857142857142e-06, | |
| "loss": 0.0981, | |
| "num_tokens": 1388529.0, | |
| "reward": 0.3696874976158142, | |
| "reward_std": 0.155008003115654, | |
| "rewards/rollout_reward_func/mean": 0.3696874976158142, | |
| "rewards/rollout_reward_func/std": 0.28414538502693176, | |
| "sampling/importance_sampling_ratio/max": 1.8336728811264038, | |
| "sampling/importance_sampling_ratio/mean": 0.9352109432220459, | |
| "sampling/importance_sampling_ratio/min": 0.28059616684913635, | |
| "sampling/sampling_logp_difference/max": 1.0694303512573242, | |
| "sampling/sampling_logp_difference/mean": 0.05270082503557205, | |
| "step": 18, | |
| "step_time": 13.463537552000162 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03351574344560504, | |
| "clip_ratio/high_mean": 0.017963151913136244, | |
| "clip_ratio/low_mean": 0.005672972998581827, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023636124562472105, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2782.0, | |
| "completions/max_terminated_length": 2782.0, | |
| "completions/mean_length": 2203.90625, | |
| "completions/mean_terminated_length": 2203.90625, | |
| "completions/min_length": 1564.0, | |
| "completions/min_terminated_length": 1564.0, | |
| "entropy": 0.4163732975721359, | |
| "epoch": 0.00152, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.54580020904541, | |
| "kl": 0.0039027896127663553, | |
| "learning_rate": 3.0857142857142855e-06, | |
| "loss": -0.0385, | |
| "num_tokens": 1472480.0, | |
| "reward": 0.2887499928474426, | |
| "reward_std": 0.1067335307598114, | |
| "rewards/rollout_reward_func/mean": 0.2887499928474426, | |
| "rewards/rollout_reward_func/std": 0.17496080696582794, | |
| "sampling/importance_sampling_ratio/max": 2.46917724609375, | |
| "sampling/importance_sampling_ratio/mean": 1.0520013570785522, | |
| "sampling/importance_sampling_ratio/min": 0.31319668889045715, | |
| "sampling/sampling_logp_difference/max": 0.6751515865325928, | |
| "sampling/sampling_logp_difference/mean": 0.04795370250940323, | |
| "step": 19, | |
| "step_time": 12.881922476999762 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03289473615586758, | |
| "clip_ratio/high_mean": 0.016854635905474424, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.016854635905474424, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2803.0, | |
| "completions/max_terminated_length": 2803.0, | |
| "completions/mean_length": 1839.5625, | |
| "completions/mean_terminated_length": 1839.5625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.3896697387099266, | |
| "epoch": 0.0016, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 3.0274949073791504, | |
| "kl": 0.004332752665504813, | |
| "learning_rate": 3.257142857142857e-06, | |
| "loss": 0.1087, | |
| "num_tokens": 1543546.0, | |
| "reward": 0.4571874737739563, | |
| "reward_std": 0.20620864629745483, | |
| "rewards/rollout_reward_func/mean": 0.4571874737739563, | |
| "rewards/rollout_reward_func/std": 0.38446637988090515, | |
| "sampling/importance_sampling_ratio/max": 2.2497854232788086, | |
| "sampling/importance_sampling_ratio/mean": 0.9864073395729065, | |
| "sampling/importance_sampling_ratio/min": 0.3370327055454254, | |
| "sampling/sampling_logp_difference/max": 0.9195313453674316, | |
| "sampling/sampling_logp_difference/mean": 0.04598519578576088, | |
| "step": 20, | |
| "step_time": 13.28698261799991 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.047807968221604824, | |
| "clip_ratio/high_mean": 0.017668311716988683, | |
| "clip_ratio/low_mean": 0.0087070451118052, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.026375357527285814, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2784.0, | |
| "completions/max_terminated_length": 2784.0, | |
| "completions/mean_length": 1916.84375, | |
| "completions/mean_terminated_length": 1916.84375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.3857065215706825, | |
| "epoch": 0.00168, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 2.2928388118743896, | |
| "kl": 0.0030007859459146857, | |
| "learning_rate": 3.4285714285714285e-06, | |
| "loss": 0.0691, | |
| "num_tokens": 1617299.0, | |
| "reward": 0.5199999809265137, | |
| "reward_std": 0.24810142815113068, | |
| "rewards/rollout_reward_func/mean": 0.5199999809265137, | |
| "rewards/rollout_reward_func/std": 0.44394853711128235, | |
| "sampling/importance_sampling_ratio/max": 1.9692103862762451, | |
| "sampling/importance_sampling_ratio/mean": 1.0206944942474365, | |
| "sampling/importance_sampling_ratio/min": 0.37676262855529785, | |
| "sampling/sampling_logp_difference/max": 0.5263509750366211, | |
| "sampling/sampling_logp_difference/mean": 0.04122690111398697, | |
| "step": 21, | |
| "step_time": 12.188486421000334 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.040458154398947954, | |
| "clip_ratio/high_mean": 0.011364538804627955, | |
| "clip_ratio/low_mean": 0.006526540499180555, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.017891079653054476, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2801.0, | |
| "completions/max_terminated_length": 2801.0, | |
| "completions/mean_length": 1878.4375, | |
| "completions/mean_terminated_length": 1878.4375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.3931129276752472, | |
| "epoch": 0.00176, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.7913806438446045, | |
| "kl": 0.004387594643048942, | |
| "learning_rate": 3.6e-06, | |
| "loss": -0.0657, | |
| "num_tokens": 1690165.0, | |
| "reward": 0.48281246423721313, | |
| "reward_std": 0.24489575624465942, | |
| "rewards/rollout_reward_func/mean": 0.48281246423721313, | |
| "rewards/rollout_reward_func/std": 0.42833685874938965, | |
| "sampling/importance_sampling_ratio/max": 2.001044750213623, | |
| "sampling/importance_sampling_ratio/mean": 0.8716533780097961, | |
| "sampling/importance_sampling_ratio/min": 0.21946659684181213, | |
| "sampling/sampling_logp_difference/max": 0.6549723148345947, | |
| "sampling/sampling_logp_difference/mean": 0.04290828853845596, | |
| "step": 22, | |
| "step_time": 12.900562208999872 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012908496893942356, | |
| "clip_ratio/high_mean": 0.003227124223485589, | |
| "clip_ratio/low_mean": 0.0013888889225199819, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004616013146005571, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2795.0, | |
| "completions/max_terminated_length": 2795.0, | |
| "completions/mean_length": 1921.96875, | |
| "completions/mean_terminated_length": 1921.96875, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.3991682603955269, | |
| "epoch": 0.00184, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 2.757072687149048, | |
| "kl": 0.002871026110369712, | |
| "learning_rate": 3.7714285714285716e-06, | |
| "loss": -0.0322, | |
| "num_tokens": 1764351.0, | |
| "reward": 0.4725000262260437, | |
| "reward_std": 0.15098075568675995, | |
| "rewards/rollout_reward_func/mean": 0.4725000262260437, | |
| "rewards/rollout_reward_func/std": 0.3937331438064575, | |
| "sampling/importance_sampling_ratio/max": 2.473691463470459, | |
| "sampling/importance_sampling_ratio/mean": 1.0277502536773682, | |
| "sampling/importance_sampling_ratio/min": 0.3683130145072937, | |
| "sampling/sampling_logp_difference/max": 0.8325839042663574, | |
| "sampling/sampling_logp_difference/mean": 0.041013769805431366, | |
| "step": 23, | |
| "step_time": 12.923486550999996 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022044573910534382, | |
| "clip_ratio/high_mean": 0.0072472544852644205, | |
| "clip_ratio/low_mean": 0.007787698996253312, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015034952783025801, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2791.0, | |
| "completions/max_terminated_length": 2791.0, | |
| "completions/mean_length": 2014.1875, | |
| "completions/mean_terminated_length": 2014.1875, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.39251676946878433, | |
| "epoch": 0.00192, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.9654884338378906, | |
| "kl": 0.0081728242803365, | |
| "learning_rate": 3.942857142857143e-06, | |
| "loss": -0.0383, | |
| "num_tokens": 1841628.0, | |
| "reward": 0.35874998569488525, | |
| "reward_std": 0.21719886362552643, | |
| "rewards/rollout_reward_func/mean": 0.35874998569488525, | |
| "rewards/rollout_reward_func/std": 0.31252095103263855, | |
| "sampling/importance_sampling_ratio/max": 2.0834484100341797, | |
| "sampling/importance_sampling_ratio/mean": 0.9893499612808228, | |
| "sampling/importance_sampling_ratio/min": 0.06596492230892181, | |
| "sampling/sampling_logp_difference/max": 1.764291524887085, | |
| "sampling/sampling_logp_difference/mean": 0.05037356913089752, | |
| "step": 24, | |
| "step_time": 12.55188547500029 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03194444486871362, | |
| "clip_ratio/high_mean": 0.009474206599406898, | |
| "clip_ratio/low_mean": 0.004620927385985851, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.014095134101808071, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2432.0, | |
| "completions/max_terminated_length": 2432.0, | |
| "completions/mean_length": 1997.46875, | |
| "completions/mean_terminated_length": 1997.46875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.40253835916519165, | |
| "epoch": 0.002, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 2.1649582386016846, | |
| "kl": 0.007934511464554816, | |
| "learning_rate": 4.114285714285715e-06, | |
| "loss": -0.084, | |
| "num_tokens": 1918276.0, | |
| "reward": 0.3425000011920929, | |
| "reward_std": 0.16030071675777435, | |
| "rewards/rollout_reward_func/mean": 0.3425000011920929, | |
| "rewards/rollout_reward_func/std": 0.27845191955566406, | |
| "sampling/importance_sampling_ratio/max": 1.7379083633422852, | |
| "sampling/importance_sampling_ratio/mean": 1.0123233795166016, | |
| "sampling/importance_sampling_ratio/min": 0.21978217363357544, | |
| "sampling/sampling_logp_difference/max": 0.9820888042449951, | |
| "sampling/sampling_logp_difference/mean": 0.043975915759801865, | |
| "step": 25, | |
| "step_time": 11.6416912709999 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.057189542800188065, | |
| "clip_ratio/high_mean": 0.02329625654965639, | |
| "clip_ratio/low_mean": 0.008795286994427443, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03209154261276126, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2812.0, | |
| "completions/max_terminated_length": 2812.0, | |
| "completions/mean_length": 2010.0625, | |
| "completions/mean_terminated_length": 2010.0625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.3653796315193176, | |
| "epoch": 0.00208, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 2.1230344772338867, | |
| "kl": 0.006589570315554738, | |
| "learning_rate": 4.285714285714286e-06, | |
| "loss": -0.0197, | |
| "num_tokens": 1995372.0, | |
| "reward": 0.4256249964237213, | |
| "reward_std": 0.23703671991825104, | |
| "rewards/rollout_reward_func/mean": 0.4256249964237213, | |
| "rewards/rollout_reward_func/std": 0.3602412939071655, | |
| "sampling/importance_sampling_ratio/max": 1.7632914781570435, | |
| "sampling/importance_sampling_ratio/mean": 0.9213794469833374, | |
| "sampling/importance_sampling_ratio/min": 0.4378761649131775, | |
| "sampling/sampling_logp_difference/max": 0.56688392162323, | |
| "sampling/sampling_logp_difference/mean": 0.03944293037056923, | |
| "step": 26, | |
| "step_time": 13.15109654299954 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04949874710291624, | |
| "clip_ratio/high_mean": 0.02149919094517827, | |
| "clip_ratio/low_mean": 0.00766741088591516, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02916660183109343, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2785.0, | |
| "completions/max_terminated_length": 2785.0, | |
| "completions/mean_length": 1842.0625, | |
| "completions/mean_terminated_length": 1842.0625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.4269302785396576, | |
| "epoch": 0.00216, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.6013935804367065, | |
| "kl": 0.00617267657071352, | |
| "learning_rate": 4.457142857142857e-06, | |
| "loss": -0.0345, | |
| "num_tokens": 2066465.0, | |
| "reward": 0.5221875309944153, | |
| "reward_std": 0.22779378294944763, | |
| "rewards/rollout_reward_func/mean": 0.5221875309944153, | |
| "rewards/rollout_reward_func/std": 0.4334239661693573, | |
| "sampling/importance_sampling_ratio/max": 2.312187433242798, | |
| "sampling/importance_sampling_ratio/mean": 0.8621585369110107, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9948511123657227, | |
| "sampling/sampling_logp_difference/mean": 0.051924653351306915, | |
| "step": 27, | |
| "step_time": 12.681567872999949 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04371212236583233, | |
| "clip_ratio/high_mean": 0.0183574166148901, | |
| "clip_ratio/low_mean": 0.005908275721594691, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.024265691870823503, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2796.0, | |
| "completions/max_terminated_length": 2796.0, | |
| "completions/mean_length": 2155.5, | |
| "completions/mean_terminated_length": 2155.5, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.41429970413446426, | |
| "epoch": 0.00224, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.647275447845459, | |
| "kl": 0.010079714236781001, | |
| "learning_rate": 4.628571428571429e-06, | |
| "loss": -0.0864, | |
| "num_tokens": 2148817.0, | |
| "reward": 0.3021875023841858, | |
| "reward_std": 0.11279378086328506, | |
| "rewards/rollout_reward_func/mean": 0.3021875023841858, | |
| "rewards/rollout_reward_func/std": 0.23064753413200378, | |
| "sampling/importance_sampling_ratio/max": 2.1843345165252686, | |
| "sampling/importance_sampling_ratio/mean": 0.9328470230102539, | |
| "sampling/importance_sampling_ratio/min": 0.11585874110460281, | |
| "sampling/sampling_logp_difference/max": 1.9821176528930664, | |
| "sampling/sampling_logp_difference/mean": 0.05276907980442047, | |
| "step": 28, | |
| "step_time": 12.799536062000016 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.039141415152698755, | |
| "clip_ratio/high_mean": 0.019034530967473984, | |
| "clip_ratio/low_mean": 0.005208333372138441, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02424286410678178, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2411.0, | |
| "completions/max_terminated_length": 2411.0, | |
| "completions/mean_length": 1544.21875, | |
| "completions/mean_terminated_length": 1544.21875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.38873114436864853, | |
| "epoch": 0.00232, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.288419485092163, | |
| "kl": 0.008441059850156307, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": -0.0294, | |
| "num_tokens": 2209518.0, | |
| "reward": 0.5049999952316284, | |
| "reward_std": 0.367961049079895, | |
| "rewards/rollout_reward_func/mean": 0.5049999952316284, | |
| "rewards/rollout_reward_func/std": 0.4586867392063141, | |
| "sampling/importance_sampling_ratio/max": 1.7176055908203125, | |
| "sampling/importance_sampling_ratio/mean": 0.8919655084609985, | |
| "sampling/importance_sampling_ratio/min": 0.3174732029438019, | |
| "sampling/sampling_logp_difference/max": 1.007685899734497, | |
| "sampling/sampling_logp_difference/mean": 0.043198756873607635, | |
| "step": 29, | |
| "step_time": 11.569315259000177 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03119284799322486, | |
| "clip_ratio/high_mean": 0.009251700364984572, | |
| "clip_ratio/low_mean": 0.0032051282469183207, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.012456828728318214, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2776.0, | |
| "completions/max_terminated_length": 2776.0, | |
| "completions/mean_length": 1695.40625, | |
| "completions/mean_terminated_length": 1695.40625, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.38929111510515213, | |
| "epoch": 0.0024, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.749756932258606, | |
| "kl": 0.01017191493883729, | |
| "learning_rate": 4.9714285714285715e-06, | |
| "loss": 0.0146, | |
| "num_tokens": 2275561.0, | |
| "reward": 0.5309374928474426, | |
| "reward_std": 0.32216140627861023, | |
| "rewards/rollout_reward_func/mean": 0.5309374928474426, | |
| "rewards/rollout_reward_func/std": 0.4390852451324463, | |
| "sampling/importance_sampling_ratio/max": 2.9540531635284424, | |
| "sampling/importance_sampling_ratio/mean": 1.0208276510238647, | |
| "sampling/importance_sampling_ratio/min": 0.37041175365448, | |
| "sampling/sampling_logp_difference/max": 0.5885751247406006, | |
| "sampling/sampling_logp_difference/mean": 0.04683335870504379, | |
| "step": 30, | |
| "step_time": 12.191692169999897 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05563905602321029, | |
| "clip_ratio/high_mean": 0.01747169380541891, | |
| "clip_ratio/low_mean": 0.008184524020180106, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02565621805842966, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2783.0, | |
| "completions/max_terminated_length": 2783.0, | |
| "completions/mean_length": 1801.09375, | |
| "completions/mean_terminated_length": 1801.09375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.3914438411593437, | |
| "epoch": 0.00248, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 2.8585875034332275, | |
| "kl": 0.015274998731911182, | |
| "learning_rate": 5.142857142857142e-06, | |
| "loss": 0.0419, | |
| "num_tokens": 2345322.0, | |
| "reward": 0.36281251907348633, | |
| "reward_std": 0.2801453769207001, | |
| "rewards/rollout_reward_func/mean": 0.36281251907348633, | |
| "rewards/rollout_reward_func/std": 0.342911958694458, | |
| "sampling/importance_sampling_ratio/max": 2.163181781768799, | |
| "sampling/importance_sampling_ratio/mean": 0.9487945437431335, | |
| "sampling/importance_sampling_ratio/min": 0.29707521200180054, | |
| "sampling/sampling_logp_difference/max": 0.7824678421020508, | |
| "sampling/sampling_logp_difference/mean": 0.0532098188996315, | |
| "step": 31, | |
| "step_time": 13.19187305000014 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03187447274103761, | |
| "clip_ratio/high_mean": 0.018647319404408336, | |
| "clip_ratio/low_mean": 0.004727297928184271, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02337461756542325, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2435.0, | |
| "completions/max_terminated_length": 2435.0, | |
| "completions/mean_length": 1984.90625, | |
| "completions/mean_terminated_length": 1984.90625, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.416415698826313, | |
| "epoch": 0.00256, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.3030495643615723, | |
| "kl": 0.015865659108385444, | |
| "learning_rate": 5.314285714285714e-06, | |
| "loss": -0.0567, | |
| "num_tokens": 2421421.0, | |
| "reward": 0.3878124952316284, | |
| "reward_std": 0.23157384991645813, | |
| "rewards/rollout_reward_func/mean": 0.3878124952316284, | |
| "rewards/rollout_reward_func/std": 0.3412286341190338, | |
| "sampling/importance_sampling_ratio/max": 2.5926010608673096, | |
| "sampling/importance_sampling_ratio/mean": 0.9760158658027649, | |
| "sampling/importance_sampling_ratio/min": 0.2061164528131485, | |
| "sampling/sampling_logp_difference/max": 0.8063008785247803, | |
| "sampling/sampling_logp_difference/mean": 0.04909588024020195, | |
| "step": 32, | |
| "step_time": 11.466194520999807 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.019717262126505375, | |
| "clip_ratio/high_mean": 0.004929315531626344, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004929315531626344, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2809.0, | |
| "completions/max_terminated_length": 2809.0, | |
| "completions/mean_length": 2102.71875, | |
| "completions/mean_terminated_length": 2102.71875, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.42558059841394424, | |
| "epoch": 0.00264, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.5914040803909302, | |
| "kl": 0.010543531039729714, | |
| "learning_rate": 5.485714285714286e-06, | |
| "loss": 0.0448, | |
| "num_tokens": 2501867.0, | |
| "reward": 0.5221875309944153, | |
| "reward_std": 0.14279377460479736, | |
| "rewards/rollout_reward_func/mean": 0.5221875309944153, | |
| "rewards/rollout_reward_func/std": 0.4007873833179474, | |
| "sampling/importance_sampling_ratio/max": 1.5994207859039307, | |
| "sampling/importance_sampling_ratio/mean": 0.8397550582885742, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9267706871032715, | |
| "sampling/sampling_logp_difference/mean": 0.0471554696559906, | |
| "step": 33, | |
| "step_time": 12.975996798000097 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.040178571827709675, | |
| "clip_ratio/high_mean": 0.016144166933372617, | |
| "clip_ratio/low_mean": 0.005662594106979668, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021806761040352285, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2441.0, | |
| "completions/max_terminated_length": 2441.0, | |
| "completions/mean_length": 1488.4375, | |
| "completions/mean_terminated_length": 1488.4375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.35695891827344894, | |
| "epoch": 0.00272, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.6733559370040894, | |
| "kl": 0.020034206565469503, | |
| "learning_rate": 5.6571428571428576e-06, | |
| "loss": -0.0588, | |
| "num_tokens": 2560884.0, | |
| "reward": 0.5859375, | |
| "reward_std": 0.38607701659202576, | |
| "rewards/rollout_reward_func/mean": 0.5859375, | |
| "rewards/rollout_reward_func/std": 0.45654281973838806, | |
| "sampling/importance_sampling_ratio/max": 1.8220971822738647, | |
| "sampling/importance_sampling_ratio/mean": 0.9860107898712158, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9601047039031982, | |
| "sampling/sampling_logp_difference/mean": 0.052328821271657944, | |
| "step": 34, | |
| "step_time": 10.76481853400037 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00657894741743803, | |
| "clip_ratio/high_mean": 0.003289473708719015, | |
| "clip_ratio/low_mean": 0.008878070977516472, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.012167544686235487, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2801.0, | |
| "completions/max_terminated_length": 2801.0, | |
| "completions/mean_length": 1756.46875, | |
| "completions/mean_terminated_length": 1756.46875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.38564804941415787, | |
| "epoch": 0.0028, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.5950710773468018, | |
| "kl": 0.0196278584189713, | |
| "learning_rate": 5.8285714285714284e-06, | |
| "loss": 0.0794, | |
| "num_tokens": 2629098.0, | |
| "reward": 0.4750000238418579, | |
| "reward_std": 0.26933756470680237, | |
| "rewards/rollout_reward_func/mean": 0.4750000238418579, | |
| "rewards/rollout_reward_func/std": 0.40420371294021606, | |
| "sampling/importance_sampling_ratio/max": 2.8944315910339355, | |
| "sampling/importance_sampling_ratio/mean": 1.212613582611084, | |
| "sampling/importance_sampling_ratio/min": 0.3920697867870331, | |
| "sampling/sampling_logp_difference/max": 0.7614344358444214, | |
| "sampling/sampling_logp_difference/mean": 0.050811417400836945, | |
| "step": 35, | |
| "step_time": 12.117880444999855 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.032855731435120106, | |
| "clip_ratio/high_mean": 0.008213932858780026, | |
| "clip_ratio/low_mean": 0.008068988332524896, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.016282920725643635, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2819.0, | |
| "completions/max_terminated_length": 2819.0, | |
| "completions/mean_length": 2214.375, | |
| "completions/mean_terminated_length": 2214.375, | |
| "completions/min_length": 1579.0, | |
| "completions/min_terminated_length": 1579.0, | |
| "entropy": 0.4132639244198799, | |
| "epoch": 0.00288, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.4248710870742798, | |
| "kl": 0.04949819762259722, | |
| "learning_rate": 6e-06, | |
| "loss": -0.1152, | |
| "num_tokens": 2713433.0, | |
| "reward": 0.3043749928474426, | |
| "reward_std": 0.08011817932128906, | |
| "rewards/rollout_reward_func/mean": 0.3043749928474426, | |
| "rewards/rollout_reward_func/std": 0.16871310770511627, | |
| "sampling/importance_sampling_ratio/max": 2.279515504837036, | |
| "sampling/importance_sampling_ratio/mean": 1.0208816528320312, | |
| "sampling/importance_sampling_ratio/min": 0.2197788804769516, | |
| "sampling/sampling_logp_difference/max": 1.5309280157089233, | |
| "sampling/sampling_logp_difference/mean": 0.05491582304239273, | |
| "step": 36, | |
| "step_time": 13.165009270000155 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01785714365541935, | |
| "clip_ratio/high_mean": 0.004464285913854837, | |
| "clip_ratio/low_mean": 0.0022321429569274187, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006696428870782256, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2803.0, | |
| "completions/max_terminated_length": 2803.0, | |
| "completions/mean_length": 1736.1875, | |
| "completions/mean_terminated_length": 1736.1875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.3515569269657135, | |
| "epoch": 0.00296, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.0670298337936401, | |
| "kl": 0.025617226026952267, | |
| "learning_rate": 5.999999982184864e-06, | |
| "loss": 0.0221, | |
| "num_tokens": 2780777.0, | |
| "reward": 0.4387500286102295, | |
| "reward_std": 0.25966876745224, | |
| "rewards/rollout_reward_func/mean": 0.4387500286102295, | |
| "rewards/rollout_reward_func/std": 0.3832606077194214, | |
| "sampling/importance_sampling_ratio/max": 2.3271644115448, | |
| "sampling/importance_sampling_ratio/mean": 1.0649113655090332, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.0678925514221191, | |
| "sampling/sampling_logp_difference/mean": 0.05666026473045349, | |
| "step": 37, | |
| "step_time": 12.593250806000015 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.028383397962898016, | |
| "clip_ratio/high_mean": 0.010161041049286723, | |
| "clip_ratio/low_mean": 0.006483843666501343, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.016644884599372745, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2777.0, | |
| "completions/max_terminated_length": 2777.0, | |
| "completions/mean_length": 1819.5625, | |
| "completions/mean_terminated_length": 1819.5625, | |
| "completions/min_length": 1056.0, | |
| "completions/min_terminated_length": 1056.0, | |
| "entropy": 0.38034912198781967, | |
| "epoch": 0.00304, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.0448880195617676, | |
| "kl": 0.04296189732849598, | |
| "learning_rate": 5.999999928739459e-06, | |
| "loss": -0.0115, | |
| "num_tokens": 2851032.0, | |
| "reward": 0.6024999618530273, | |
| "reward_std": 0.2617889940738678, | |
| "rewards/rollout_reward_func/mean": 0.6024999618530273, | |
| "rewards/rollout_reward_func/std": 0.44098126888275146, | |
| "sampling/importance_sampling_ratio/max": 2.681164503097534, | |
| "sampling/importance_sampling_ratio/mean": 1.0418896675109863, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.4294462203979492, | |
| "sampling/sampling_logp_difference/mean": 0.0609976202249527, | |
| "step": 38, | |
| "step_time": 12.55964067500031 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.047167123295366764, | |
| "clip_ratio/high_mean": 0.014736625598743558, | |
| "clip_ratio/low_mean": 0.004429678898304701, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01916630449704826, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2820.0, | |
| "completions/max_terminated_length": 2820.0, | |
| "completions/mean_length": 2000.0, | |
| "completions/mean_terminated_length": 2000.0, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.4035666435956955, | |
| "epoch": 0.00312, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.904247760772705, | |
| "kl": 0.03608058113604784, | |
| "learning_rate": 5.999999839663784e-06, | |
| "loss": -0.1975, | |
| "num_tokens": 2927712.0, | |
| "reward": 0.3853124976158142, | |
| "reward_std": 0.1657649129629135, | |
| "rewards/rollout_reward_func/mean": 0.3853124976158142, | |
| "rewards/rollout_reward_func/std": 0.31012988090515137, | |
| "sampling/importance_sampling_ratio/max": 2.3516104221343994, | |
| "sampling/importance_sampling_ratio/mean": 0.8599222898483276, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.4187037944793701, | |
| "sampling/sampling_logp_difference/mean": 0.05978023633360863, | |
| "step": 39, | |
| "step_time": 12.440508590000036 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04069459065794945, | |
| "clip_ratio/high_mean": 0.017941734986379743, | |
| "clip_ratio/low_mean": 0.0016447368543595076, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01958647184073925, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2423.0, | |
| "completions/max_terminated_length": 2423.0, | |
| "completions/mean_length": 1889.0625, | |
| "completions/mean_terminated_length": 1889.0625, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.42887038737535477, | |
| "epoch": 0.0032, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.507852077484131, | |
| "kl": 0.031137569807469845, | |
| "learning_rate": 5.99999971495784e-06, | |
| "loss": -0.0375, | |
| "num_tokens": 3000212.0, | |
| "reward": 0.38593751192092896, | |
| "reward_std": 0.16842570900917053, | |
| "rewards/rollout_reward_func/mean": 0.38593751192092896, | |
| "rewards/rollout_reward_func/std": 0.35313212871551514, | |
| "sampling/importance_sampling_ratio/max": 1.8619109392166138, | |
| "sampling/importance_sampling_ratio/mean": 0.8876512050628662, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.8854889869689941, | |
| "sampling/sampling_logp_difference/mean": 0.0671561062335968, | |
| "step": 40, | |
| "step_time": 11.693177195999851 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02651259582489729, | |
| "clip_ratio/high_mean": 0.006628148956224322, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00836426008027047, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2789.0, | |
| "completions/max_terminated_length": 2789.0, | |
| "completions/mean_length": 2136.03125, | |
| "completions/mean_terminated_length": 2136.03125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.42095063626766205, | |
| "epoch": 0.00328, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.2850134372711182, | |
| "kl": 0.039208856876939535, | |
| "learning_rate": 5.99999955462163e-06, | |
| "loss": -0.0237, | |
| "num_tokens": 3081651.0, | |
| "reward": 0.3506249785423279, | |
| "reward_std": 0.1440507173538208, | |
| "rewards/rollout_reward_func/mean": 0.3506249785423279, | |
| "rewards/rollout_reward_func/std": 0.2683153748512268, | |
| "sampling/importance_sampling_ratio/max": 2.8166987895965576, | |
| "sampling/importance_sampling_ratio/mean": 1.0108704566955566, | |
| "sampling/importance_sampling_ratio/min": 0.14420194923877716, | |
| "sampling/sampling_logp_difference/max": 1.127936840057373, | |
| "sampling/sampling_logp_difference/mean": 0.06519916653633118, | |
| "step": 41, | |
| "step_time": 14.135176596000292 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03996024373918772, | |
| "clip_ratio/high_mean": 0.012911256635561585, | |
| "clip_ratio/low_mean": 0.004817708395421505, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01772896503098309, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2767.0, | |
| "completions/max_terminated_length": 2767.0, | |
| "completions/mean_length": 1934.65625, | |
| "completions/mean_terminated_length": 1934.65625, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.38335342705249786, | |
| "epoch": 0.00336, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.1722676753997803, | |
| "kl": 0.13585597835481167, | |
| "learning_rate": 5.999999358655157e-06, | |
| "loss": -0.2418, | |
| "num_tokens": 3156023.0, | |
| "reward": 0.3475000262260437, | |
| "reward_std": 0.21655070781707764, | |
| "rewards/rollout_reward_func/mean": 0.3475000262260437, | |
| "rewards/rollout_reward_func/std": 0.3131937086582184, | |
| "sampling/importance_sampling_ratio/max": 2.6130497455596924, | |
| "sampling/importance_sampling_ratio/mean": 0.8806287050247192, | |
| "sampling/importance_sampling_ratio/min": 0.16678351163864136, | |
| "sampling/sampling_logp_difference/max": 2.3499860763549805, | |
| "sampling/sampling_logp_difference/mean": 0.06342820823192596, | |
| "step": 42, | |
| "step_time": 13.112628190999885 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021321472711861134, | |
| "clip_ratio/high_mean": 0.007562511134892702, | |
| "clip_ratio/low_mean": 0.0038768798112869263, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.011439391179010272, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2797.0, | |
| "completions/max_terminated_length": 2797.0, | |
| "completions/mean_length": 1674.3125, | |
| "completions/mean_terminated_length": 1674.3125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.3885280713438988, | |
| "epoch": 0.00344, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.4752204418182373, | |
| "kl": 0.036413189955055714, | |
| "learning_rate": 5.999999127058423e-06, | |
| "loss": 0.0258, | |
| "num_tokens": 3221611.0, | |
| "reward": 0.6737500429153442, | |
| "reward_std": 0.25966876745224, | |
| "rewards/rollout_reward_func/mean": 0.6737500429153442, | |
| "rewards/rollout_reward_func/std": 0.4556862711906433, | |
| "sampling/importance_sampling_ratio/max": 2.9477226734161377, | |
| "sampling/importance_sampling_ratio/mean": 1.1396255493164062, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.167872667312622, | |
| "sampling/sampling_logp_difference/mean": 0.06658157706260681, | |
| "step": 43, | |
| "step_time": 12.020586962000152 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.036011905409395695, | |
| "clip_ratio/high_mean": 0.010423430823720992, | |
| "clip_ratio/low_mean": 0.0030159883899614215, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.013439419795759022, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2814.0, | |
| "completions/max_terminated_length": 2814.0, | |
| "completions/mean_length": 2095.75, | |
| "completions/mean_terminated_length": 2095.75, | |
| "completions/min_length": 1568.0, | |
| "completions/min_terminated_length": 1568.0, | |
| "entropy": 0.39560940861701965, | |
| "epoch": 0.00352, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.8694807291030884, | |
| "kl": 0.1402588039636612, | |
| "learning_rate": 5.999998859831431e-06, | |
| "loss": -0.1597, | |
| "num_tokens": 3301324.0, | |
| "reward": 0.40437501668930054, | |
| "reward_std": 0.2259407639503479, | |
| "rewards/rollout_reward_func/mean": 0.40437501668930054, | |
| "rewards/rollout_reward_func/std": 0.35422733426094055, | |
| "sampling/importance_sampling_ratio/max": 2.6974401473999023, | |
| "sampling/importance_sampling_ratio/mean": 0.8676252365112305, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.911269187927246, | |
| "sampling/sampling_logp_difference/mean": 0.08191373944282532, | |
| "step": 44, | |
| "step_time": 12.868387047999704 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0369886364787817, | |
| "clip_ratio/high_mean": 0.011032873298972845, | |
| "clip_ratio/low_mean": 0.0043535883305594325, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015386461513116956, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2793.0, | |
| "completions/max_terminated_length": 2793.0, | |
| "completions/mean_length": 2412.75, | |
| "completions/mean_terminated_length": 2412.75, | |
| "completions/min_length": 1056.0, | |
| "completions/min_terminated_length": 1056.0, | |
| "entropy": 0.4342958629131317, | |
| "epoch": 0.0036, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.422937035560608, | |
| "kl": 0.11194289568811655, | |
| "learning_rate": 5.999998556974188e-06, | |
| "loss": -0.1586, | |
| "num_tokens": 3392626.0, | |
| "reward": 0.35750001668930054, | |
| "reward_std": 0.0949999988079071, | |
| "rewards/rollout_reward_func/mean": 0.35750001668930054, | |
| "rewards/rollout_reward_func/std": 0.260532945394516, | |
| "sampling/importance_sampling_ratio/max": 2.1776068210601807, | |
| "sampling/importance_sampling_ratio/mean": 0.852668285369873, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.2368037700653076, | |
| "sampling/sampling_logp_difference/mean": 0.07167594134807587, | |
| "step": 45, | |
| "step_time": 13.40532306199998 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.036038962192833424, | |
| "clip_ratio/high_mean": 0.012058520689606667, | |
| "clip_ratio/low_mean": 0.0017857142956927419, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.013844234868884087, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2784.0, | |
| "completions/max_terminated_length": 2784.0, | |
| "completions/mean_length": 2018.40625, | |
| "completions/mean_terminated_length": 2018.40625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.3769753500819206, | |
| "epoch": 0.00368, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 6.942874908447266, | |
| "kl": 0.8322499115020037, | |
| "learning_rate": 5.999998218486697e-06, | |
| "loss": -0.0692, | |
| "num_tokens": 3469989.0, | |
| "reward": 0.39250001311302185, | |
| "reward_std": 0.14825798571109772, | |
| "rewards/rollout_reward_func/mean": 0.39250001311302185, | |
| "rewards/rollout_reward_func/std": 0.29918164014816284, | |
| "sampling/importance_sampling_ratio/max": 2.446554660797119, | |
| "sampling/importance_sampling_ratio/mean": 0.8061342239379883, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.015519142150879, | |
| "sampling/sampling_logp_difference/mean": 0.07696790993213654, | |
| "step": 46, | |
| "step_time": 12.274703390000013 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04237867519259453, | |
| "clip_ratio/high_mean": 0.01807057624682784, | |
| "clip_ratio/low_mean": 0.005178963067010045, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.023249539081007242, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2800.0, | |
| "completions/max_terminated_length": 2800.0, | |
| "completions/mean_length": 1802.75, | |
| "completions/mean_terminated_length": 1802.75, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.38206612318754196, | |
| "epoch": 0.00376, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.6050693988800049, | |
| "kl": 0.05531273875385523, | |
| "learning_rate": 5.999997844368963e-06, | |
| "loss": -0.0113, | |
| "num_tokens": 3540097.0, | |
| "reward": 0.4990624785423279, | |
| "reward_std": 0.28371256589889526, | |
| "rewards/rollout_reward_func/mean": 0.4990624785423279, | |
| "rewards/rollout_reward_func/std": 0.41065138578414917, | |
| "sampling/importance_sampling_ratio/max": 1.9599696397781372, | |
| "sampling/importance_sampling_ratio/mean": 0.8884379863739014, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.8157303333282471, | |
| "sampling/sampling_logp_difference/mean": 0.06130218505859375, | |
| "step": 47, | |
| "step_time": 12.33328224800016 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0206808946095407, | |
| "clip_ratio/high_mean": 0.005170223652385175, | |
| "clip_ratio/low_mean": 0.004861111170612276, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01003133482299745, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2813.0, | |
| "completions/max_terminated_length": 2813.0, | |
| "completions/mean_length": 1976.21875, | |
| "completions/mean_terminated_length": 1976.21875, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.37976498901844025, | |
| "epoch": 0.00384, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.5906230211257935, | |
| "kl": 0.11688470654189587, | |
| "learning_rate": 5.999997434620992e-06, | |
| "loss": -0.1357, | |
| "num_tokens": 3616089.0, | |
| "reward": 0.437812477350235, | |
| "reward_std": 0.20705953240394592, | |
| "rewards/rollout_reward_func/mean": 0.437812477350235, | |
| "rewards/rollout_reward_func/std": 0.35220715403556824, | |
| "sampling/importance_sampling_ratio/max": 1.8663876056671143, | |
| "sampling/importance_sampling_ratio/mean": 0.8626433610916138, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.65832781791687, | |
| "sampling/sampling_logp_difference/mean": 0.06971758604049683, | |
| "step": 48, | |
| "step_time": 12.541744299999891 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012820512987673283, | |
| "clip_ratio/high_mean": 0.0032051282469183207, | |
| "clip_ratio/low_mean": 0.0014534883666783571, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004658616613596678, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2807.0, | |
| "completions/max_terminated_length": 2807.0, | |
| "completions/mean_length": 2245.15625, | |
| "completions/mean_terminated_length": 2245.15625, | |
| "completions/min_length": 1551.0, | |
| "completions/min_terminated_length": 1551.0, | |
| "entropy": 0.4284479096531868, | |
| "epoch": 0.00392, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.0771631002426147, | |
| "kl": 0.046674114651978016, | |
| "learning_rate": 5.999996989242791e-06, | |
| "loss": -0.0014, | |
| "num_tokens": 3701038.0, | |
| "reward": 0.42624998092651367, | |
| "reward_std": 0.13466876745224, | |
| "rewards/rollout_reward_func/mean": 0.42624998092651367, | |
| "rewards/rollout_reward_func/std": 0.3314265012741089, | |
| "sampling/importance_sampling_ratio/max": 1.4823979139328003, | |
| "sampling/importance_sampling_ratio/mean": 0.8060042858123779, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.11665940284729, | |
| "sampling/sampling_logp_difference/mean": 0.0697537213563919, | |
| "step": 49, | |
| "step_time": 13.132170692999807 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2787.0, | |
| "completions/max_terminated_length": 2787.0, | |
| "completions/mean_length": 2460.71875, | |
| "completions/mean_terminated_length": 2460.71875, | |
| "completions/min_length": 2034.0, | |
| "completions/min_terminated_length": 2034.0, | |
| "entropy": 0.4269709587097168, | |
| "epoch": 0.004, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.06742172688245773, | |
| "kl": 0.05730041675269604, | |
| "learning_rate": 5.999996508234369e-06, | |
| "loss": 0.0008, | |
| "num_tokens": 3793655.0, | |
| "reward": 0.30000001192092896, | |
| "reward_std": 0.0, | |
| "rewards/rollout_reward_func/mean": 0.30000001192092896, | |
| "rewards/rollout_reward_func/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.4013469219207764, | |
| "sampling/importance_sampling_ratio/mean": 0.8128387928009033, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.065826416015625, | |
| "sampling/sampling_logp_difference/mean": 0.07448764890432358, | |
| "step": 50, | |
| "step_time": 13.017520340999681 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03630952490493655, | |
| "clip_ratio/high_mean": 0.012549603707157075, | |
| "clip_ratio/low_mean": 0.0031565657118335366, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015706169069744647, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2785.0, | |
| "completions/max_terminated_length": 2785.0, | |
| "completions/mean_length": 1773.78125, | |
| "completions/mean_terminated_length": 1773.78125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.37685880810022354, | |
| "epoch": 0.00408, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.299412727355957, | |
| "kl": 0.04002719838172197, | |
| "learning_rate": 5.999995991595729e-06, | |
| "loss": -0.0109, | |
| "num_tokens": 3862448.0, | |
| "reward": 0.5353125333786011, | |
| "reward_std": 0.08654377609491348, | |
| "rewards/rollout_reward_func/mean": 0.5353125333786011, | |
| "rewards/rollout_reward_func/std": 0.41608762741088867, | |
| "sampling/importance_sampling_ratio/max": 2.3817224502563477, | |
| "sampling/importance_sampling_ratio/mean": 0.9811595678329468, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.0328466892242432, | |
| "sampling/sampling_logp_difference/mean": 0.06913870573043823, | |
| "step": 51, | |
| "step_time": 12.60499654799969 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03819444449618459, | |
| "clip_ratio/high_mean": 0.015144050237722695, | |
| "clip_ratio/low_mean": 0.00554396363440901, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02068801363930106, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2784.0, | |
| "completions/max_terminated_length": 2784.0, | |
| "completions/mean_length": 1786.09375, | |
| "completions/mean_terminated_length": 1786.09375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.38082515448331833, | |
| "epoch": 0.00416, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.5871905088424683, | |
| "kl": 0.06744291074573994, | |
| "learning_rate": 5.999995439326883e-06, | |
| "loss": -0.0699, | |
| "num_tokens": 3931876.0, | |
| "reward": 0.6090624928474426, | |
| "reward_std": 0.26599711179733276, | |
| "rewards/rollout_reward_func/mean": 0.6090624928474426, | |
| "rewards/rollout_reward_func/std": 0.4591953456401825, | |
| "sampling/importance_sampling_ratio/max": 2.734297752380371, | |
| "sampling/importance_sampling_ratio/mean": 0.9665597677230835, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.065897226333618, | |
| "sampling/sampling_logp_difference/mean": 0.06354629993438721, | |
| "step": 52, | |
| "step_time": 13.568785395000077 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022086466662585735, | |
| "clip_ratio/high_mean": 0.008820227812975645, | |
| "clip_ratio/low_mean": 0.0057043652050197124, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01452459313441068, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2789.0, | |
| "completions/max_terminated_length": 2789.0, | |
| "completions/mean_length": 1635.03125, | |
| "completions/mean_terminated_length": 1635.03125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.36858493834733963, | |
| "epoch": 0.00424, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.9566222429275513, | |
| "kl": 0.07718627620488405, | |
| "learning_rate": 5.999994851427837e-06, | |
| "loss": 0.0822, | |
| "num_tokens": 3995868.0, | |
| "reward": 0.6918749809265137, | |
| "reward_std": 0.3203721046447754, | |
| "rewards/rollout_reward_func/mean": 0.6918749809265137, | |
| "rewards/rollout_reward_func/std": 0.4697249233722687, | |
| "sampling/importance_sampling_ratio/max": 2.7838289737701416, | |
| "sampling/importance_sampling_ratio/mean": 0.9136906266212463, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7872750759124756, | |
| "sampling/sampling_logp_difference/mean": 0.07199069857597351, | |
| "step": 53, | |
| "step_time": 12.247058065999909 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.041652148589491844, | |
| "clip_ratio/high_mean": 0.013425522716715932, | |
| "clip_ratio/low_mean": 0.01002952002454549, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0234550426248461, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2784.0, | |
| "completions/max_terminated_length": 2784.0, | |
| "completions/mean_length": 1590.09375, | |
| "completions/mean_terminated_length": 1590.09375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.37869949638843536, | |
| "epoch": 0.00432, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 2.3980886936187744, | |
| "kl": 0.05773049034178257, | |
| "learning_rate": 5.999994227898604e-06, | |
| "loss": -0.0192, | |
| "num_tokens": 4058303.0, | |
| "reward": 0.4609374701976776, | |
| "reward_std": 0.35279375314712524, | |
| "rewards/rollout_reward_func/mean": 0.4609374701976776, | |
| "rewards/rollout_reward_func/std": 0.44058871269226074, | |
| "sampling/importance_sampling_ratio/max": 2.2311129570007324, | |
| "sampling/importance_sampling_ratio/mean": 0.9393452405929565, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9994411468505859, | |
| "sampling/sampling_logp_difference/mean": 0.08165294677019119, | |
| "step": 54, | |
| "step_time": 11.528886767999893 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02447916753590107, | |
| "clip_ratio/high_mean": 0.0075732802506536245, | |
| "clip_ratio/low_mean": 0.00947712454944849, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01705040503293276, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2444.0, | |
| "completions/max_terminated_length": 2444.0, | |
| "completions/mean_length": 1789.75, | |
| "completions/mean_terminated_length": 1789.75, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.36519913375377655, | |
| "epoch": 0.0044, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 2.2972187995910645, | |
| "kl": 0.05506392475217581, | |
| "learning_rate": 5.99999356873919e-06, | |
| "loss": -0.1185, | |
| "num_tokens": 4127411.0, | |
| "reward": 0.40562498569488525, | |
| "reward_std": 0.22391541302204132, | |
| "rewards/rollout_reward_func/mean": 0.40562498569488525, | |
| "rewards/rollout_reward_func/std": 0.3422500193119049, | |
| "sampling/importance_sampling_ratio/max": 2.4115021228790283, | |
| "sampling/importance_sampling_ratio/mean": 0.9461013674736023, | |
| "sampling/importance_sampling_ratio/min": 0.14957794547080994, | |
| "sampling/sampling_logp_difference/max": 1.0122857093811035, | |
| "sampling/sampling_logp_difference/mean": 0.06259442120790482, | |
| "step": 55, | |
| "step_time": 11.493340414999693 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0438775522634387, | |
| "clip_ratio/high_mean": 0.012457483448088169, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.012457483448088169, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2799.0, | |
| "completions/max_terminated_length": 2799.0, | |
| "completions/mean_length": 2306.375, | |
| "completions/mean_terminated_length": 2306.375, | |
| "completions/min_length": 1567.0, | |
| "completions/min_terminated_length": 1567.0, | |
| "entropy": 0.40949854254722595, | |
| "epoch": 0.00448, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 2.023374319076538, | |
| "kl": 0.08688413165509701, | |
| "learning_rate": 5.999992873949609e-06, | |
| "loss": -0.0712, | |
| "num_tokens": 4214487.0, | |
| "reward": 0.296875, | |
| "reward_std": 0.08874999731779099, | |
| "rewards/rollout_reward_func/mean": 0.296875, | |
| "rewards/rollout_reward_func/std": 0.15228237211704254, | |
| "sampling/importance_sampling_ratio/max": 2.9896316528320312, | |
| "sampling/importance_sampling_ratio/mean": 0.968756377696991, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.835113763809204, | |
| "sampling/sampling_logp_difference/mean": 0.08090537041425705, | |
| "step": 56, | |
| "step_time": 13.222404719000224 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04506416339427233, | |
| "clip_ratio/high_mean": 0.01424223161302507, | |
| "clip_ratio/low_mean": 0.002842377289198339, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01708460901863873, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2441.0, | |
| "completions/max_terminated_length": 2441.0, | |
| "completions/mean_length": 1963.0, | |
| "completions/mean_terminated_length": 1963.0, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.4080217182636261, | |
| "epoch": 0.00456, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.663516640663147, | |
| "kl": 0.3106076046824455, | |
| "learning_rate": 5.999992143529868e-06, | |
| "loss": -0.0796, | |
| "num_tokens": 4289619.0, | |
| "reward": 0.3934375047683716, | |
| "reward_std": 0.1563829779624939, | |
| "rewards/rollout_reward_func/mean": 0.3934375047683716, | |
| "rewards/rollout_reward_func/std": 0.30592650175094604, | |
| "sampling/importance_sampling_ratio/max": 1.4833005666732788, | |
| "sampling/importance_sampling_ratio/mean": 0.5795140862464905, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.2869999408721924, | |
| "sampling/sampling_logp_difference/mean": 0.0979442298412323, | |
| "step": 57, | |
| "step_time": 11.762371722000125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.046875, | |
| "clip_ratio/high_mean": 0.01171875, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01171875, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2802.0, | |
| "completions/max_terminated_length": 2802.0, | |
| "completions/mean_length": 1879.75, | |
| "completions/mean_terminated_length": 1879.75, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.3892976716160774, | |
| "epoch": 0.00464, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.1009422540664673, | |
| "kl": 0.05097049381583929, | |
| "learning_rate": 5.999991377479982e-06, | |
| "loss": -0.0191, | |
| "num_tokens": 4362090.0, | |
| "reward": 0.5262500047683716, | |
| "reward_std": 0.1875, | |
| "rewards/rollout_reward_func/mean": 0.5262500047683716, | |
| "rewards/rollout_reward_func/std": 0.4009806215763092, | |
| "sampling/importance_sampling_ratio/max": 2.9964590072631836, | |
| "sampling/importance_sampling_ratio/mean": 1.0218505859375, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.0378296375274658, | |
| "sampling/sampling_logp_difference/mean": 0.06926104426383972, | |
| "step": 58, | |
| "step_time": 13.15352440300012 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.019571688026189804, | |
| "clip_ratio/high_mean": 0.004892922006547451, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004892922006547451, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2832.0, | |
| "completions/max_terminated_length": 2832.0, | |
| "completions/mean_length": 2202.375, | |
| "completions/mean_terminated_length": 2202.375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.41450754553079605, | |
| "epoch": 0.00472, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 2.1054370403289795, | |
| "kl": 0.03926007356494665, | |
| "learning_rate": 5.999990575799961e-06, | |
| "loss": 0.0595, | |
| "num_tokens": 4446012.0, | |
| "reward": 0.44343751668930054, | |
| "reward_std": 0.13312500715255737, | |
| "rewards/rollout_reward_func/mean": 0.44343751668930054, | |
| "rewards/rollout_reward_func/std": 0.3428213894367218, | |
| "sampling/importance_sampling_ratio/max": 2.236393928527832, | |
| "sampling/importance_sampling_ratio/mean": 0.8590089678764343, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9236248731613159, | |
| "sampling/sampling_logp_difference/mean": 0.06904841959476471, | |
| "step": 59, | |
| "step_time": 13.58062329900008 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029240576550364494, | |
| "clip_ratio/high_mean": 0.007310144137591124, | |
| "clip_ratio/low_mean": 0.0030868902103975415, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010397034231573343, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2429.0, | |
| "completions/max_terminated_length": 2429.0, | |
| "completions/mean_length": 2124.9375, | |
| "completions/mean_terminated_length": 2124.9375, | |
| "completions/min_length": 1567.0, | |
| "completions/min_terminated_length": 1567.0, | |
| "entropy": 0.370839923620224, | |
| "epoch": 0.0048, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.5435988903045654, | |
| "kl": 0.08518982026726007, | |
| "learning_rate": 5.99998973848982e-06, | |
| "loss": -0.0579, | |
| "num_tokens": 4527051.0, | |
| "reward": 0.3590624928474426, | |
| "reward_std": 0.06796419620513916, | |
| "rewards/rollout_reward_func/mean": 0.3590624928474426, | |
| "rewards/rollout_reward_func/std": 0.22809672355651855, | |
| "sampling/importance_sampling_ratio/max": 2.2381787300109863, | |
| "sampling/importance_sampling_ratio/mean": 0.8449472188949585, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6413207054138184, | |
| "sampling/sampling_logp_difference/mean": 0.069917693734169, | |
| "step": 60, | |
| "step_time": 11.73221161399988 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02281746082007885, | |
| "clip_ratio/high_mean": 0.006954365293495357, | |
| "clip_ratio/low_mean": 0.0037499999161809683, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010704364976845682, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2767.0, | |
| "completions/max_terminated_length": 2767.0, | |
| "completions/mean_length": 1689.3125, | |
| "completions/mean_terminated_length": 1689.3125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.3677019253373146, | |
| "epoch": 0.00488, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.9829214811325073, | |
| "kl": 0.058779667131602764, | |
| "learning_rate": 5.999988865549569e-06, | |
| "loss": 0.0304, | |
| "num_tokens": 4593095.0, | |
| "reward": 0.6549999713897705, | |
| "reward_std": 0.22813192009925842, | |
| "rewards/rollout_reward_func/mean": 0.6549999713897705, | |
| "rewards/rollout_reward_func/std": 0.45271220803260803, | |
| "sampling/importance_sampling_ratio/max": 1.883159875869751, | |
| "sampling/importance_sampling_ratio/mean": 0.8463116884231567, | |
| "sampling/importance_sampling_ratio/min": 0.22824469208717346, | |
| "sampling/sampling_logp_difference/max": 1.781625747680664, | |
| "sampling/sampling_logp_difference/mean": 0.06828776746988297, | |
| "step": 61, | |
| "step_time": 12.480462479000153 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0110975606366992, | |
| "clip_ratio/high_mean": 0.0027743901591748, | |
| "clip_ratio/low_mean": 0.0014880952658131719, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004262485424987972, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2815.0, | |
| "completions/max_terminated_length": 2815.0, | |
| "completions/mean_length": 2305.15625, | |
| "completions/mean_terminated_length": 2305.15625, | |
| "completions/min_length": 1571.0, | |
| "completions/min_terminated_length": 1571.0, | |
| "entropy": 0.4012472406029701, | |
| "epoch": 0.00496, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8583229780197144, | |
| "kl": 0.06238031107932329, | |
| "learning_rate": 5.999987956979225e-06, | |
| "loss": -0.0392, | |
| "num_tokens": 4680377.0, | |
| "reward": 0.3434374928474426, | |
| "reward_std": 0.08029377460479736, | |
| "rewards/rollout_reward_func/mean": 0.3434374928474426, | |
| "rewards/rollout_reward_func/std": 0.22245851159095764, | |
| "sampling/importance_sampling_ratio/max": 2.7665059566497803, | |
| "sampling/importance_sampling_ratio/mean": 0.9882571697235107, | |
| "sampling/importance_sampling_ratio/min": 0.059778764843940735, | |
| "sampling/sampling_logp_difference/max": 0.9543299674987793, | |
| "sampling/sampling_logp_difference/mean": 0.0674777626991272, | |
| "step": 62, | |
| "step_time": 14.56621960199982 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05253623379394412, | |
| "clip_ratio/high_mean": 0.016606280929408967, | |
| "clip_ratio/low_mean": 0.0018382353009656072, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.018444516230374575, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2423.0, | |
| "completions/max_terminated_length": 2423.0, | |
| "completions/mean_length": 1968.9375, | |
| "completions/mean_terminated_length": 1968.9375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.4196172505617142, | |
| "epoch": 0.00504, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.905213475227356, | |
| "kl": 0.09828684013336897, | |
| "learning_rate": 5.999987012778799e-06, | |
| "loss": -0.0034, | |
| "num_tokens": 4755993.0, | |
| "reward": 0.33031249046325684, | |
| "reward_std": 0.09654378145933151, | |
| "rewards/rollout_reward_func/mean": 0.33031249046325684, | |
| "rewards/rollout_reward_func/std": 0.21877197921276093, | |
| "sampling/importance_sampling_ratio/max": 2.0344085693359375, | |
| "sampling/importance_sampling_ratio/mean": 0.7417819499969482, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.208054542541504, | |
| "sampling/sampling_logp_difference/mean": 0.07868118584156036, | |
| "step": 63, | |
| "step_time": 11.736785162999922 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0055555556900799274, | |
| "clip_ratio/high_mean": 0.0027777778450399637, | |
| "clip_ratio/low_mean": 0.003794643096625805, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006572420941665769, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2814.0, | |
| "completions/max_terminated_length": 2814.0, | |
| "completions/mean_length": 1972.9375, | |
| "completions/mean_terminated_length": 1972.9375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.4023704081773758, | |
| "epoch": 0.00512, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.3090136051177979, | |
| "kl": 0.12237261980772018, | |
| "learning_rate": 5.9999860329483104e-06, | |
| "loss": -0.194, | |
| "num_tokens": 4831827.0, | |
| "reward": 0.5878125429153442, | |
| "reward_std": 0.142506942152977, | |
| "rewards/rollout_reward_func/mean": 0.5878125429153442, | |
| "rewards/rollout_reward_func/std": 0.4488244950771332, | |
| "sampling/importance_sampling_ratio/max": 2.9826838970184326, | |
| "sampling/importance_sampling_ratio/mean": 0.9495848417282104, | |
| "sampling/importance_sampling_ratio/min": 0.01580546610057354, | |
| "sampling/sampling_logp_difference/max": 2.0490379333496094, | |
| "sampling/sampling_logp_difference/mean": 0.07465916872024536, | |
| "step": 64, | |
| "step_time": 12.846850890000042 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021152781788259745, | |
| "clip_ratio/high_mean": 0.006812585634179413, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006812585634179413, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2800.0, | |
| "completions/max_terminated_length": 2800.0, | |
| "completions/mean_length": 2005.0, | |
| "completions/mean_terminated_length": 2005.0, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.40342626720666885, | |
| "epoch": 0.0052, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.2556148767471313, | |
| "kl": 0.0746797863394022, | |
| "learning_rate": 5.999985017487771e-06, | |
| "loss": -0.0305, | |
| "num_tokens": 4908716.0, | |
| "reward": 0.3818749785423279, | |
| "reward_std": 0.15371949970722198, | |
| "rewards/rollout_reward_func/mean": 0.3818749785423279, | |
| "rewards/rollout_reward_func/std": 0.3037022650241852, | |
| "sampling/importance_sampling_ratio/max": 2.2475836277008057, | |
| "sampling/importance_sampling_ratio/mean": 0.8594139814376831, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.4058151245117188, | |
| "sampling/sampling_logp_difference/mean": 0.06716296076774597, | |
| "step": 65, | |
| "step_time": 12.423915739000222 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03018707549199462, | |
| "clip_ratio/high_mean": 0.007546768872998655, | |
| "clip_ratio/low_mean": 0.006225198740139604, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.013771967613138258, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2792.0, | |
| "completions/max_terminated_length": 2792.0, | |
| "completions/mean_length": 2048.40625, | |
| "completions/mean_terminated_length": 2048.40625, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.38859760761260986, | |
| "epoch": 0.00528, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 2.1451351642608643, | |
| "kl": 0.22367357090115547, | |
| "learning_rate": 5.999983966397197e-06, | |
| "loss": -0.1677, | |
| "num_tokens": 4987207.0, | |
| "reward": 0.4712499976158142, | |
| "reward_std": 0.18434235453605652, | |
| "rewards/rollout_reward_func/mean": 0.4712499976158142, | |
| "rewards/rollout_reward_func/std": 0.35322248935699463, | |
| "sampling/importance_sampling_ratio/max": 2.9422554969787598, | |
| "sampling/importance_sampling_ratio/mean": 0.9915428161621094, | |
| "sampling/importance_sampling_ratio/min": 0.016011416912078857, | |
| "sampling/sampling_logp_difference/max": 2.497363805770874, | |
| "sampling/sampling_logp_difference/mean": 0.07474374771118164, | |
| "step": 66, | |
| "step_time": 12.926716641999974 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01376319769769907, | |
| "clip_ratio/high_mean": 0.005043363547883928, | |
| "clip_ratio/low_mean": 0.0014880952658131719, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0065314588136971, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2783.0, | |
| "completions/max_terminated_length": 2783.0, | |
| "completions/mean_length": 1772.40625, | |
| "completions/mean_terminated_length": 1772.40625, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.4034885838627815, | |
| "epoch": 0.00536, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.5488033294677734, | |
| "kl": 0.05104802828282118, | |
| "learning_rate": 5.999982879676608e-06, | |
| "loss": -0.041, | |
| "num_tokens": 5055760.0, | |
| "reward": 0.5737500190734863, | |
| "reward_std": 0.16325795650482178, | |
| "rewards/rollout_reward_func/mean": 0.5737500190734863, | |
| "rewards/rollout_reward_func/std": 0.40885162353515625, | |
| "sampling/importance_sampling_ratio/max": 2.278799057006836, | |
| "sampling/importance_sampling_ratio/mean": 0.99635910987854, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.8835396766662598, | |
| "sampling/sampling_logp_difference/mean": 0.06713330745697021, | |
| "step": 67, | |
| "step_time": 12.46692701400002 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.004360465100035071, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004360465100035071, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2796.0, | |
| "completions/max_terminated_length": 2796.0, | |
| "completions/mean_length": 2270.40625, | |
| "completions/mean_terminated_length": 2270.40625, | |
| "completions/min_length": 1983.0, | |
| "completions/min_terminated_length": 1983.0, | |
| "entropy": 0.4274456053972244, | |
| "epoch": 0.00544, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8523308038711548, | |
| "kl": 0.13869191519916058, | |
| "learning_rate": 5.9999817573260195e-06, | |
| "loss": -0.1124, | |
| "num_tokens": 5141713.0, | |
| "reward": 0.2878125011920929, | |
| "reward_std": 0.01750694215297699, | |
| "rewards/rollout_reward_func/mean": 0.2878125011920929, | |
| "rewards/rollout_reward_func/std": 0.03849880024790764, | |
| "sampling/importance_sampling_ratio/max": 2.765258550643921, | |
| "sampling/importance_sampling_ratio/mean": 0.8237208127975464, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7204997539520264, | |
| "sampling/sampling_logp_difference/mean": 0.08386299759149551, | |
| "step": 68, | |
| "step_time": 12.989918530000296 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04849738674238324, | |
| "clip_ratio/high_mean": 0.016489425906911492, | |
| "clip_ratio/low_mean": 0.0013888889225199819, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.017878314713016152, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2441.0, | |
| "completions/max_terminated_length": 2441.0, | |
| "completions/mean_length": 1876.46875, | |
| "completions/mean_terminated_length": 1876.46875, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.39091238379478455, | |
| "epoch": 0.00552, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.4318950176239014, | |
| "kl": 0.09389345720410347, | |
| "learning_rate": 5.999980599345448e-06, | |
| "loss": -0.0356, | |
| "num_tokens": 5214177.0, | |
| "reward": 0.5806249976158142, | |
| "reward_std": 0.07874999940395355, | |
| "rewards/rollout_reward_func/mean": 0.5806249976158142, | |
| "rewards/rollout_reward_func/std": 0.4241190552711487, | |
| "sampling/importance_sampling_ratio/max": 1.9308501482009888, | |
| "sampling/importance_sampling_ratio/mean": 0.9257422089576721, | |
| "sampling/importance_sampling_ratio/min": 0.21397040784358978, | |
| "sampling/sampling_logp_difference/max": 1.7655794620513916, | |
| "sampling/sampling_logp_difference/mean": 0.06729073822498322, | |
| "step": 69, | |
| "step_time": 11.706464537000102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.012202381156384945, | |
| "clip_ratio/high_mean": 0.0030505952890962362, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0030505952890962362, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2429.0, | |
| "completions/max_terminated_length": 2429.0, | |
| "completions/mean_length": 1901.125, | |
| "completions/mean_terminated_length": 1901.125, | |
| "completions/min_length": 1056.0, | |
| "completions/min_terminated_length": 1056.0, | |
| "entropy": 0.4174434766173363, | |
| "epoch": 0.0056, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.2482589483261108, | |
| "kl": 0.0709009887650609, | |
| "learning_rate": 5.999979405734914e-06, | |
| "loss": -0.0875, | |
| "num_tokens": 5287259.0, | |
| "reward": 0.44999998807907104, | |
| "reward_std": 0.19828803837299347, | |
| "rewards/rollout_reward_func/mean": 0.44999998807907104, | |
| "rewards/rollout_reward_func/std": 0.36232221126556396, | |
| "sampling/importance_sampling_ratio/max": 2.240993022918701, | |
| "sampling/importance_sampling_ratio/mean": 0.7815386652946472, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.9302306175231934, | |
| "sampling/sampling_logp_difference/mean": 0.07109043747186661, | |
| "step": 70, | |
| "step_time": 11.92579563199979 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03573596617206931, | |
| "clip_ratio/high_mean": 0.011878836317919195, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01361494732555002, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2800.0, | |
| "completions/max_terminated_length": 2800.0, | |
| "completions/mean_length": 2164.34375, | |
| "completions/mean_terminated_length": 2164.34375, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.4176176115870476, | |
| "epoch": 0.00568, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.604291319847107, | |
| "kl": 0.0694936579093337, | |
| "learning_rate": 5.999978176494435e-06, | |
| "loss": -0.1233, | |
| "num_tokens": 5369772.0, | |
| "reward": 0.4762499928474426, | |
| "reward_std": 0.21075797080993652, | |
| "rewards/rollout_reward_func/mean": 0.4762499928474426, | |
| "rewards/rollout_reward_func/std": 0.3796156644821167, | |
| "sampling/importance_sampling_ratio/max": 2.497992992401123, | |
| "sampling/importance_sampling_ratio/mean": 0.8422503471374512, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.5480012893676758, | |
| "sampling/sampling_logp_difference/mean": 0.0728713721036911, | |
| "step": 71, | |
| "step_time": 12.894382659999792 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02281746082007885, | |
| "clip_ratio/high_mean": 0.008831319864839315, | |
| "clip_ratio/low_mean": 0.003260501311160624, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.012091821059584618, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2443.0, | |
| "completions/max_terminated_length": 2443.0, | |
| "completions/mean_length": 1719.03125, | |
| "completions/mean_terminated_length": 1719.03125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.39671653509140015, | |
| "epoch": 0.00576, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.5779305696487427, | |
| "kl": 0.08434087503701448, | |
| "learning_rate": 5.99997691162403e-06, | |
| "loss": -0.0636, | |
| "num_tokens": 5436596.0, | |
| "reward": 0.6024999618530273, | |
| "reward_std": 0.2729267477989197, | |
| "rewards/rollout_reward_func/mean": 0.6024999618530273, | |
| "rewards/rollout_reward_func/std": 0.4505694806575775, | |
| "sampling/importance_sampling_ratio/max": 2.754258632659912, | |
| "sampling/importance_sampling_ratio/mean": 0.9120872020721436, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.2350893020629883, | |
| "sampling/sampling_logp_difference/mean": 0.07831829786300659, | |
| "step": 72, | |
| "step_time": 12.011820702999785 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013888888992369175, | |
| "clip_ratio/high_mean": 0.0034722222480922937, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005208333372138441, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2780.0, | |
| "completions/max_terminated_length": 2780.0, | |
| "completions/mean_length": 1720.125, | |
| "completions/mean_terminated_length": 1720.125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.35359790176153183, | |
| "epoch": 0.00584, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.760622262954712, | |
| "kl": 0.0876467265188694, | |
| "learning_rate": 5.99997561112372e-06, | |
| "loss": -0.0248, | |
| "num_tokens": 5503333.0, | |
| "reward": 0.7487499713897705, | |
| "reward_std": 0.2987908720970154, | |
| "rewards/rollout_reward_func/mean": 0.7487499713897705, | |
| "rewards/rollout_reward_func/std": 0.46135663986206055, | |
| "sampling/importance_sampling_ratio/max": 2.7877893447875977, | |
| "sampling/importance_sampling_ratio/mean": 0.8998199701309204, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.3734312057495117, | |
| "sampling/sampling_logp_difference/mean": 0.07869358360767365, | |
| "step": 73, | |
| "step_time": 13.119324159999906 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013701201416552067, | |
| "clip_ratio/high_mean": 0.005070037324912846, | |
| "clip_ratio/low_mean": 0.0030487803742289543, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008118817582726479, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2825.0, | |
| "completions/max_terminated_length": 2825.0, | |
| "completions/mean_length": 1916.5, | |
| "completions/mean_terminated_length": 1916.5, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.38354693353176117, | |
| "epoch": 0.00592, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.8614935874938965, | |
| "kl": 0.05992862023413181, | |
| "learning_rate": 5.999974274993527e-06, | |
| "loss": 0.012, | |
| "num_tokens": 5576952.0, | |
| "reward": 0.5674999952316284, | |
| "reward_std": 0.20719751715660095, | |
| "rewards/rollout_reward_func/mean": 0.5674999952316284, | |
| "rewards/rollout_reward_func/std": 0.4317331612110138, | |
| "sampling/importance_sampling_ratio/max": 1.6715911626815796, | |
| "sampling/importance_sampling_ratio/mean": 0.8377959132194519, | |
| "sampling/importance_sampling_ratio/min": 0.196980819106102, | |
| "sampling/sampling_logp_difference/max": 0.913780689239502, | |
| "sampling/sampling_logp_difference/mean": 0.06950188428163528, | |
| "step": 74, | |
| "step_time": 12.793109332000085 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2785.0, | |
| "completions/max_terminated_length": 2785.0, | |
| "completions/mean_length": 1587.53125, | |
| "completions/mean_terminated_length": 1587.53125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.35842984169721603, | |
| "epoch": 0.006, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.3697266578674316, | |
| "kl": 0.05413582641631365, | |
| "learning_rate": 5.99997290323347e-06, | |
| "loss": -0.0661, | |
| "num_tokens": 5639305.0, | |
| "reward": 0.7106249928474426, | |
| "reward_std": 0.30803900957107544, | |
| "rewards/rollout_reward_func/mean": 0.7106249928474426, | |
| "rewards/rollout_reward_func/std": 0.4498884081840515, | |
| "sampling/importance_sampling_ratio/max": 1.9579815864562988, | |
| "sampling/importance_sampling_ratio/mean": 0.8830677270889282, | |
| "sampling/importance_sampling_ratio/min": 0.15943297743797302, | |
| "sampling/sampling_logp_difference/max": 1.0672590732574463, | |
| "sampling/sampling_logp_difference/mean": 0.06444922834634781, | |
| "step": 75, | |
| "step_time": 12.032428736000156 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_clip_ratio/high_max": 0.0, | |
| "eval_clip_ratio/high_mean": 0.0, | |
| "eval_clip_ratio/low_mean": 0.0, | |
| "eval_clip_ratio/low_min": 0.0, | |
| "eval_clip_ratio/region_mean": 0.0, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 2488.6, | |
| "eval_completions/max_terminated_length": 2488.6, | |
| "eval_completions/mean_length": 1951.175, | |
| "eval_completions/mean_terminated_length": 1951.175, | |
| "eval_completions/min_length": 1361.4, | |
| "eval_completions/min_terminated_length": 1361.4, | |
| "eval_entropy": 0.37236364781856535, | |
| "eval_frac_reward_zero_std": 0.1, | |
| "eval_kl": 0.07629953697323799, | |
| "eval_loss": -0.0013724860036745667, | |
| "eval_num_tokens": 5639305.0, | |
| "eval_reward": 0.47524999976158144, | |
| "eval_reward_std": 0.37039353847503664, | |
| "eval_rewards/rollout_reward_func/mean": 0.47524999976158144, | |
| "eval_rewards/rollout_reward_func/std": 0.37039353176951406, | |
| "eval_runtime": 10.5117, | |
| "eval_samples_per_second": 0.951, | |
| "eval_sampling/importance_sampling_ratio/max": 1.6587244033813477, | |
| "eval_sampling/importance_sampling_ratio/mean": 0.8837172389030457, | |
| "eval_sampling/importance_sampling_ratio/min": 0.32487900257110597, | |
| "eval_sampling/sampling_logp_difference/max": 0.8625046908855438, | |
| "eval_sampling/sampling_logp_difference/mean": 0.06909476891160012, | |
| "eval_steps_per_second": 0.285, | |
| "step": 75 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 25000, | |
| "num_input_tokens_seen": 5639305, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |