Instructions to use Gege24/environment-zay with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Gege24/environment-zay with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
model = PeftModel.from_pretrained(base_model, "Gege24/environment-zay")
```
- Transformers
How to use Gege24/environment-zay with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Gege24/environment-zay")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Gege24/environment-zay", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Gege24/environment-zay with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Gege24/environment-zay"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/environment-zay",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/Gege24/environment-zay
- SGLang
How to use Gege24/environment-zay with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Gege24/environment-zay" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/environment-zay",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Gege24/environment-zay" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/environment-zay",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use Gege24/environment-zay with Docker Model Runner:
docker model run hf.co/Gege24/environment-zay
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9609.0, | |
| "completions/max_terminated_length": 9609.0, | |
| "completions/mean_length": 7527.34375, | |
| "completions/mean_terminated_length": 7527.34375, | |
| "completions/min_length": 2464.0, | |
| "completions/min_terminated_length": 2464.0, | |
| "entropy": 0.31753343041054904, | |
| "epoch": 0.008, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4123356342315674, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.305, | |
| "num_tokens": 263947.0, | |
| "reward": 0.031437501311302185, | |
| "reward_std": 0.22218962013721466, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.031437501311302185, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3469863533973694, | |
| "sampling/importance_sampling_ratio/max": 1.4741449356079102, | |
| "sampling/importance_sampling_ratio/mean": 0.29983627796173096, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.5447624921798706, | |
| "sampling/sampling_logp_difference/mean": 0.018553823232650757, | |
| "step": 1, | |
| "step_time": 335.8077390380013 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9708.0, | |
| "completions/max_terminated_length": 9708.0, | |
| "completions/mean_length": 7452.15625, | |
| "completions/mean_terminated_length": 7452.15625, | |
| "completions/min_length": 1958.0, | |
| "completions/min_terminated_length": 1958.0, | |
| "entropy": 0.26579508977010846, | |
| "epoch": 0.016, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6812145113945007, | |
| "kl": 0.0, | |
| "learning_rate": 6.25e-06, | |
| "loss": 0.3284, | |
| "num_tokens": 528400.0, | |
| "reward": 0.23356249928474426, | |
| "reward_std": 0.47549036145210266, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.23356249928474426, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5023998618125916, | |
| "sampling/importance_sampling_ratio/max": 2.4382495880126953, | |
| "sampling/importance_sampling_ratio/mean": 0.5181977152824402, | |
| "sampling/importance_sampling_ratio/min": 0.00792621448636055, | |
| "sampling/sampling_logp_difference/max": 2.3046507835388184, | |
| "sampling/sampling_logp_difference/mean": 0.017045794054865837, | |
| "step": 2, | |
| "step_time": 322.64169866899965 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7940.0, | |
| "completions/max_terminated_length": 7940.0, | |
| "completions/mean_length": 5720.75, | |
| "completions/mean_terminated_length": 5720.75, | |
| "completions/min_length": 1041.0, | |
| "completions/min_terminated_length": 1041.0, | |
| "entropy": 0.39676310354843736, | |
| "epoch": 0.024, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5915209054946899, | |
| "kl": 0.0016266869151877472, | |
| "learning_rate": 1.25e-05, | |
| "loss": -0.5029, | |
| "num_tokens": 730920.0, | |
| "reward": 0.14431250095367432, | |
| "reward_std": 0.4770090878009796, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.14431250095367432, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4686758518218994, | |
| "sampling/importance_sampling_ratio/max": 2.4560086727142334, | |
| "sampling/importance_sampling_ratio/mean": 0.37546274065971375, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.5112149715423584, | |
| "sampling/sampling_logp_difference/mean": 0.021917540580034256, | |
| "step": 3, | |
| "step_time": 280.4968342440011 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10155.0, | |
| "completions/max_terminated_length": 10155.0, | |
| "completions/mean_length": 8253.03125, | |
| "completions/mean_terminated_length": 8253.03125, | |
| "completions/min_length": 4463.0, | |
| "completions/min_terminated_length": 4463.0, | |
| "entropy": 0.3865774553269148, | |
| "epoch": 0.032, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6168197989463806, | |
| "kl": 0.0021592163539025933, | |
| "learning_rate": 1.8750000000000002e-05, | |
| "loss": 0.5303, | |
| "num_tokens": 1018057.0, | |
| "reward": -0.05943749472498894, | |
| "reward_std": 0.17190764844417572, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.05943749472498894, | |
| "rewards/alfworld_rollout_reward_func/std": 0.2674463391304016, | |
| "sampling/importance_sampling_ratio/max": 2.158449172973633, | |
| "sampling/importance_sampling_ratio/mean": 0.3775303363800049, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.5581846237182617, | |
| "sampling/sampling_logp_difference/mean": 0.02310691960155964, | |
| "step": 4, | |
| "step_time": 360.68988642800105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5661.0, | |
| "completions/max_terminated_length": 5661.0, | |
| "completions/mean_length": 4748.75, | |
| "completions/mean_terminated_length": 4748.75, | |
| "completions/min_length": 3574.0, | |
| "completions/min_terminated_length": 3574.0, | |
| "entropy": 0.3531342991627753, | |
| "epoch": 0.04, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4972003400325775, | |
| "kl": 0.0017817301231843885, | |
| "learning_rate": 2.5e-05, | |
| "loss": -0.1008, | |
| "num_tokens": 1186209.0, | |
| "reward": -0.03831250220537186, | |
| "reward_std": 0.16514073312282562, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.03831250220537186, | |
| "rewards/alfworld_rollout_reward_func/std": 0.263570100069046, | |
| "sampling/importance_sampling_ratio/max": 1.5651471614837646, | |
| "sampling/importance_sampling_ratio/mean": 0.23428680002689362, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.689125061035156, | |
| "sampling/sampling_logp_difference/mean": 0.02276257984340191, | |
| "step": 5, | |
| "step_time": 235.26227801500136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 11864.0, | |
| "completions/max_terminated_length": 11864.0, | |
| "completions/mean_length": 9544.53125, | |
| "completions/mean_terminated_length": 9544.53125, | |
| "completions/min_length": 2217.0, | |
| "completions/min_terminated_length": 2217.0, | |
| "entropy": 0.3936329837888479, | |
| "epoch": 0.048, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4895797669887543, | |
| "kl": 0.0018409217773296405, | |
| "learning_rate": 2.4995787066293908e-05, | |
| "loss": 0.0504, | |
| "num_tokens": 1519058.0, | |
| "reward": 0.08924999833106995, | |
| "reward_std": 0.3558388352394104, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.08924999833106995, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4207598865032196, | |
| "sampling/importance_sampling_ratio/max": 2.6714699268341064, | |
| "sampling/importance_sampling_ratio/mean": 0.5432157516479492, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.0583038330078125, | |
| "sampling/sampling_logp_difference/mean": 0.022378364577889442, | |
| "step": 6, | |
| "step_time": 440.67857990999437 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9557.0, | |
| "completions/max_terminated_length": 9557.0, | |
| "completions/mean_length": 8216.96875, | |
| "completions/mean_terminated_length": 8216.96875, | |
| "completions/min_length": 7643.0, | |
| "completions/min_terminated_length": 7643.0, | |
| "entropy": 0.3345654481090605, | |
| "epoch": 0.056, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.36864349246025085, | |
| "kl": 0.001815770137909567, | |
| "learning_rate": 2.498315110498529e-05, | |
| "loss": -0.6279, | |
| "num_tokens": 1805009.0, | |
| "reward": -0.07706249505281448, | |
| "reward_std": 0.10743739455938339, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.07706249505281448, | |
| "rewards/alfworld_rollout_reward_func/std": 0.18897344172000885, | |
| "sampling/importance_sampling_ratio/max": 2.8883540630340576, | |
| "sampling/importance_sampling_ratio/mean": 0.3507537841796875, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.560537338256836, | |
| "sampling/sampling_logp_difference/mean": 0.019598914310336113, | |
| "step": 7, | |
| "step_time": 352.1544552209989 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5112.0, | |
| "completions/max_terminated_length": 5112.0, | |
| "completions/mean_length": 4309.6875, | |
| "completions/mean_terminated_length": 4309.6875, | |
| "completions/min_length": 1531.0, | |
| "completions/min_terminated_length": 1531.0, | |
| "entropy": 0.3713793349452317, | |
| "epoch": 0.064, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6283824443817139, | |
| "kl": 0.0014209353576006833, | |
| "learning_rate": 2.496210063358892e-05, | |
| "loss": 0.6368, | |
| "num_tokens": 1958983.0, | |
| "reward": 0.05275000259280205, | |
| "reward_std": 0.3398139476776123, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.05275000259280205, | |
| "rewards/alfworld_rollout_reward_func/std": 0.38452139496803284, | |
| "sampling/importance_sampling_ratio/max": 2.513087272644043, | |
| "sampling/importance_sampling_ratio/mean": 0.4636477828025818, | |
| "sampling/importance_sampling_ratio/min": 0.007982950657606125, | |
| "sampling/sampling_logp_difference/max": 1.4883854389190674, | |
| "sampling/sampling_logp_difference/mean": 0.019389096647500992, | |
| "step": 8, | |
| "step_time": 223.46994931800418 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9707.0, | |
| "completions/max_terminated_length": 9707.0, | |
| "completions/mean_length": 6891.5625, | |
| "completions/mean_terminated_length": 6891.5625, | |
| "completions/min_length": 1846.0, | |
| "completions/min_terminated_length": 1846.0, | |
| "entropy": 0.3352759047411382, | |
| "epoch": 0.072, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5288219451904297, | |
| "kl": 0.0022124643182905857, | |
| "learning_rate": 2.4932649841583266e-05, | |
| "loss": -0.19, | |
| "num_tokens": 2205465.0, | |
| "reward": 0.26093751192092896, | |
| "reward_std": 0.539341151714325, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.26093751192092896, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5594924092292786, | |
| "sampling/importance_sampling_ratio/max": 2.794914960861206, | |
| "sampling/importance_sampling_ratio/mean": 0.49680012464523315, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 46.763362884521484, | |
| "sampling/sampling_logp_difference/mean": 0.023306839168071747, | |
| "step": 9, | |
| "step_time": 329.5470013760005 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10376.0, | |
| "completions/max_terminated_length": 10376.0, | |
| "completions/mean_length": 8182.25, | |
| "completions/mean_terminated_length": 8182.25, | |
| "completions/min_length": 5768.0, | |
| "completions/min_terminated_length": 5768.0, | |
| "entropy": 0.3423183555714786, | |
| "epoch": 0.08, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 500.11370849609375, | |
| "kl": 1.0201642424835882, | |
| "learning_rate": 2.489481858084583e-05, | |
| "loss": 0.1234, | |
| "num_tokens": 2489921.0, | |
| "reward": -0.07881250232458115, | |
| "reward_std": 0.10148754715919495, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.07881250232458115, | |
| "rewards/alfworld_rollout_reward_func/std": 0.18484507501125336, | |
| "sampling/importance_sampling_ratio/max": 2.4246673583984375, | |
| "sampling/importance_sampling_ratio/mean": 0.40079888701438904, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 49.023101806640625, | |
| "sampling/sampling_logp_difference/mean": 0.03266483545303345, | |
| "step": 10, | |
| "step_time": 350.9630481339973 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5637.0, | |
| "completions/max_terminated_length": 5637.0, | |
| "completions/mean_length": 5241.5625, | |
| "completions/mean_terminated_length": 5241.5625, | |
| "completions/min_length": 4864.0, | |
| "completions/min_terminated_length": 4864.0, | |
| "entropy": 0.36981488950550556, | |
| "epoch": 0.088, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5718499422073364, | |
| "kl": 0.0016583001779508777, | |
| "learning_rate": 2.4848632352271566e-05, | |
| "loss": -1.1019, | |
| "num_tokens": 2674899.0, | |
| "reward": -0.11343749612569809, | |
| "reward_std": 0.04205840826034546, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.11343749612569809, | |
| "rewards/alfworld_rollout_reward_func/std": 0.04254859685897827, | |
| "sampling/importance_sampling_ratio/max": 2.287071704864502, | |
| "sampling/importance_sampling_ratio/mean": 0.5078843832015991, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.9416933059692383, | |
| "sampling/sampling_logp_difference/mean": 0.019637946039438248, | |
| "step": 11, | |
| "step_time": 247.4129799650018 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9350.0, | |
| "completions/max_terminated_length": 9350.0, | |
| "completions/mean_length": 8717.375, | |
| "completions/mean_terminated_length": 8717.375, | |
| "completions/min_length": 8325.0, | |
| "completions/min_terminated_length": 8325.0, | |
| "entropy": 0.3274609283544123, | |
| "epoch": 0.096, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6321223974227905, | |
| "kl": 0.0023520230570284184, | |
| "learning_rate": 2.4794122288583533e-05, | |
| "loss": 0.8207, | |
| "num_tokens": 2978463.0, | |
| "reward": -0.10750000178813934, | |
| "reward_std": 0.034280747175216675, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.10750000178813934, | |
| "rewards/alfworld_rollout_reward_func/std": 0.03943144157528877, | |
| "sampling/importance_sampling_ratio/max": 2.627840042114258, | |
| "sampling/importance_sampling_ratio/mean": 0.43182629346847534, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.2869181632995605, | |
| "sampling/sampling_logp_difference/mean": 0.019458087161183357, | |
| "step": 12, | |
| "step_time": 362.32430394600306 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 11219.0, | |
| "completions/max_terminated_length": 11219.0, | |
| "completions/mean_length": 8425.9375, | |
| "completions/mean_terminated_length": 8425.9375, | |
| "completions/min_length": 6101.0, | |
| "completions/min_terminated_length": 6101.0, | |
| "entropy": 0.27532787807285786, | |
| "epoch": 0.104, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5922659635543823, | |
| "kl": 0.0038648816698696464, | |
| "learning_rate": 2.4731325133347272e-05, | |
| "loss": -0.0541, | |
| "num_tokens": 3272573.0, | |
| "reward": 0.062562495470047, | |
| "reward_std": 0.22287124395370483, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.062562495470047, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3846460282802582, | |
| "sampling/importance_sampling_ratio/max": 2.459491729736328, | |
| "sampling/importance_sampling_ratio/mean": 0.7038756608963013, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.8825843334198, | |
| "sampling/sampling_logp_difference/mean": 0.017496587708592415, | |
| "step": 13, | |
| "step_time": 377.7220775159976 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5013.0, | |
| "completions/max_terminated_length": 5013.0, | |
| "completions/mean_length": 3482.0625, | |
| "completions/mean_terminated_length": 3482.0625, | |
| "completions/min_length": 524.0, | |
| "completions/min_terminated_length": 524.0, | |
| "entropy": 0.35823827097192407, | |
| "epoch": 0.112, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7376836538314819, | |
| "kl": 0.003011322856764309, | |
| "learning_rate": 2.466028321620309e-05, | |
| "loss": 0.6336, | |
| "num_tokens": 3399487.0, | |
| "reward": 0.39381250739097595, | |
| "reward_std": 0.507357120513916, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.39381250739097595, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5500279068946838, | |
| "sampling/importance_sampling_ratio/max": 2.5899088382720947, | |
| "sampling/importance_sampling_ratio/mean": 0.6255139708518982, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7760124206542969, | |
| "sampling/sampling_logp_difference/mean": 0.021579492837190628, | |
| "step": 14, | |
| "step_time": 182.0329440820051 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9315.0, | |
| "completions/max_terminated_length": 9315.0, | |
| "completions/mean_length": 8675.8125, | |
| "completions/mean_terminated_length": 8675.8125, | |
| "completions/min_length": 7983.0, | |
| "completions/min_terminated_length": 7983.0, | |
| "entropy": 0.29767332202754915, | |
| "epoch": 0.12, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6271937489509583, | |
| "kl": 0.002263592203235021, | |
| "learning_rate": 2.4581044424332964e-05, | |
| "loss": 0.0795, | |
| "num_tokens": 3701529.0, | |
| "reward": -0.08531250059604645, | |
| "reward_std": 0.024405591189861298, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.08531250059604645, | |
| "rewards/alfworld_rollout_reward_func/std": 0.024884814396500587, | |
| "sampling/importance_sampling_ratio/max": 2.4757065773010254, | |
| "sampling/importance_sampling_ratio/mean": 0.5632457137107849, | |
| "sampling/importance_sampling_ratio/min": 0.021398449316620827, | |
| "sampling/sampling_logp_difference/max": 1.3662091493606567, | |
| "sampling/sampling_logp_difference/mean": 0.018054665997624397, | |
| "step": 15, | |
| "step_time": 357.9417388269985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5504.0, | |
| "completions/max_terminated_length": 5504.0, | |
| "completions/mean_length": 2116.59375, | |
| "completions/mean_terminated_length": 2116.59375, | |
| "completions/min_length": 705.0, | |
| "completions/min_terminated_length": 705.0, | |
| "entropy": 0.3064890103414655, | |
| "epoch": 0.128, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3754582107067108, | |
| "kl": 0.0037364878353400854, | |
| "learning_rate": 2.449366217018122e-05, | |
| "loss": 0.067, | |
| "num_tokens": 3784812.0, | |
| "reward": 0.7191874980926514, | |
| "reward_std": 0.4386064410209656, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.7191874980926514, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4786539673805237, | |
| "sampling/importance_sampling_ratio/max": 2.0642874240875244, | |
| "sampling/importance_sampling_ratio/mean": 0.6617263555526733, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.3662114143371582, | |
| "sampling/sampling_logp_difference/mean": 0.021964222192764282, | |
| "step": 16, | |
| "step_time": 139.03199604900237 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5829.0, | |
| "completions/max_terminated_length": 5829.0, | |
| "completions/mean_length": 4871.875, | |
| "completions/mean_terminated_length": 4871.875, | |
| "completions/min_length": 1450.0, | |
| "completions/min_terminated_length": 1450.0, | |
| "entropy": 0.3853617487475276, | |
| "epoch": 0.136, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6372714042663574, | |
| "kl": 0.002946533808426466, | |
| "learning_rate": 2.439819535545087e-05, | |
| "loss": 0.6343, | |
| "num_tokens": 3958504.0, | |
| "reward": -0.002124996855854988, | |
| "reward_std": 0.22313132882118225, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.002124996855854988, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3138851225376129, | |
| "sampling/importance_sampling_ratio/max": 2.760993719100952, | |
| "sampling/importance_sampling_ratio/mean": 0.6022263169288635, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7812860012054443, | |
| "sampling/sampling_logp_difference/mean": 0.02008485235273838, | |
| "step": 17, | |
| "step_time": 219.52375941698938 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 12497.0, | |
| "completions/max_terminated_length": 12497.0, | |
| "completions/mean_length": 10824.96875, | |
| "completions/mean_terminated_length": 10824.96875, | |
| "completions/min_length": 2927.0, | |
| "completions/min_terminated_length": 2927.0, | |
| "entropy": 0.3916443451307714, | |
| "epoch": 0.144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5299886465072632, | |
| "kl": 0.004217182104184758, | |
| "learning_rate": 2.4294708331399775e-05, | |
| "loss": -0.6161, | |
| "num_tokens": 4332871.0, | |
| "reward": -0.01993749663233757, | |
| "reward_std": 0.21947234869003296, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.01993749663233757, | |
| "rewards/alfworld_rollout_reward_func/std": 0.30789878964424133, | |
| "sampling/importance_sampling_ratio/max": 2.8617172241210938, | |
| "sampling/importance_sampling_ratio/mean": 0.43398600816726685, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.6485512256622314, | |
| "sampling/sampling_logp_difference/mean": 0.023952314630150795, | |
| "step": 18, | |
| "step_time": 453.0025306710031 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6249.0, | |
| "completions/max_terminated_length": 6249.0, | |
| "completions/mean_length": 5000.625, | |
| "completions/mean_terminated_length": 5000.625, | |
| "completions/min_length": 3696.0, | |
| "completions/min_terminated_length": 3696.0, | |
| "entropy": 0.43349673599004745, | |
| "epoch": 0.152, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5063394904136658, | |
| "kl": 0.003751923381059896, | |
| "learning_rate": 2.4183270855463413e-05, | |
| "loss": -0.0025, | |
| "num_tokens": 4508859.0, | |
| "reward": -0.05518750101327896, | |
| "reward_std": 0.09173674881458282, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.05518750101327896, | |
| "rewards/alfworld_rollout_reward_func/std": 0.18693235516548157, | |
| "sampling/importance_sampling_ratio/max": 2.7608871459960938, | |
| "sampling/importance_sampling_ratio/mean": 0.3959204852581024, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.0717275142669678, | |
| "sampling/sampling_logp_difference/mean": 0.02457268536090851, | |
| "step": 19, | |
| "step_time": 232.94531282299977 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 11351.0, | |
| "completions/max_terminated_length": 11351.0, | |
| "completions/mean_length": 10259.65625, | |
| "completions/mean_terminated_length": 10259.65625, | |
| "completions/min_length": 6872.0, | |
| "completions/min_terminated_length": 6872.0, | |
| "entropy": 0.3433692827820778, | |
| "epoch": 0.16, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2678588032722473, | |
| "kl": 0.00476062010784517, | |
| "learning_rate": 2.406395804423355e-05, | |
| "loss": -0.1127, | |
| "num_tokens": 4865168.0, | |
| "reward": -0.010062501765787601, | |
| "reward_std": 0.20959903299808502, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.010062501765787601, | |
| "rewards/alfworld_rollout_reward_func/std": 0.29610002040863037, | |
| "sampling/importance_sampling_ratio/max": 1.9335780143737793, | |
| "sampling/importance_sampling_ratio/mean": 0.2613844871520996, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.60330331325531, | |
| "sampling/sampling_logp_difference/mean": 0.02182953618466854, | |
| "step": 20, | |
| "step_time": 411.72310698399815 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5627.0, | |
| "completions/max_terminated_length": 5627.0, | |
| "completions/mean_length": 3213.1875, | |
| "completions/mean_terminated_length": 3213.1875, | |
| "completions/min_length": 616.0, | |
| "completions/min_terminated_length": 616.0, | |
| "entropy": 0.3132283384911716, | |
| "epoch": 0.168, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28126800060272217, | |
| "kl": 0.006387614266714081, | |
| "learning_rate": 2.3936850322824417e-05, | |
| "loss": 0.0117, | |
| "num_tokens": 4985238.0, | |
| "reward": 0.6055624485015869, | |
| "reward_std": 0.48004958033561707, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.6055624485015869, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5068143606185913, | |
| "sampling/importance_sampling_ratio/max": 2.4571378231048584, | |
| "sampling/importance_sampling_ratio/mean": 0.5615379810333252, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6348450183868408, | |
| "sampling/sampling_logp_difference/mean": 0.020369766280055046, | |
| "step": 21, | |
| "step_time": 166.97806567500083 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8290.0, | |
| "completions/max_terminated_length": 8290.0, | |
| "completions/mean_length": 6657.03125, | |
| "completions/mean_terminated_length": 6657.03125, | |
| "completions/min_length": 1775.0, | |
| "completions/min_terminated_length": 1775.0, | |
| "entropy": 0.32329121232032776, | |
| "epoch": 0.176, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.294430673122406, | |
| "kl": 0.0053747415950056165, | |
| "learning_rate": 2.380203337066063e-05, | |
| "loss": 0.0999, | |
| "num_tokens": 5218679.0, | |
| "reward": 0.0065000057220458984, | |
| "reward_std": 0.23578479886054993, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.0065000057220458984, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3685416877269745, | |
| "sampling/importance_sampling_ratio/max": 2.0549917221069336, | |
| "sampling/importance_sampling_ratio/mean": 0.4458431601524353, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.611385464668274, | |
| "sampling/sampling_logp_difference/mean": 0.019326191395521164, | |
| "step": 22, | |
| "step_time": 299.1344787260059 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5289.0, | |
| "completions/max_terminated_length": 5289.0, | |
| "completions/mean_length": 4233.375, | |
| "completions/mean_terminated_length": 4233.375, | |
| "completions/min_length": 535.0, | |
| "completions/min_terminated_length": 535.0, | |
| "entropy": 0.38607919216156006, | |
| "epoch": 0.184, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5484526753425598, | |
| "kl": 0.004964196581568103, | |
| "learning_rate": 2.36595980637233e-05, | |
| "loss": 0.1743, | |
| "num_tokens": 5371395.0, | |
| "reward": 0.25599998235702515, | |
| "reward_std": 0.5028968453407288, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.25599998235702515, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5062468647956848, | |
| "sampling/importance_sampling_ratio/max": 1.7829545736312866, | |
| "sampling/importance_sampling_ratio/mean": 0.5887770652770996, | |
| "sampling/importance_sampling_ratio/min": 0.008336997590959072, | |
| "sampling/sampling_logp_difference/max": 2.665585517883301, | |
| "sampling/sampling_logp_difference/mean": 0.02209375984966755, | |
| "step": 23, | |
| "step_time": 205.04334290899897 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 5275.0, | |
| "completions/max_terminated_length": 4975.0, | |
| "completions/mean_length": 3625.5, | |
| "completions/mean_terminated_length": 3572.290283203125, | |
| "completions/min_length": 503.0, | |
| "completions/min_terminated_length": 503.0, | |
| "entropy": 0.3871590462513268, | |
| "epoch": 0.192, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6740729212760925, | |
| "kl": 0.005397088676545536, | |
| "learning_rate": 2.3509640413293303e-05, | |
| "loss": 1.3846, | |
| "num_tokens": 5503731.0, | |
| "reward": 0.45381247997283936, | |
| "reward_std": 0.5777146220207214, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.45381247997283936, | |
| "rewards/alfworld_rollout_reward_func/std": 0.538960337638855, | |
| "sampling/importance_sampling_ratio/max": 2.8123936653137207, | |
| "sampling/importance_sampling_ratio/mean": 0.6568740606307983, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7232227325439453, | |
| "sampling/sampling_logp_difference/mean": 0.019891591742634773, | |
| "step": 24, | |
| "step_time": 187.47785765799927 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8448.0, | |
| "completions/max_terminated_length": 8448.0, | |
| "completions/mean_length": 7377.09375, | |
| "completions/mean_terminated_length": 7377.09375, | |
| "completions/min_length": 2312.0, | |
| "completions/min_terminated_length": 2312.0, | |
| "entropy": 0.35573973087593913, | |
| "epoch": 0.2, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6313058733940125, | |
| "kl": 0.006160886805446353, | |
| "learning_rate": 2.335226150123305e-05, | |
| "loss": 0.5233, | |
| "num_tokens": 5762006.0, | |
| "reward": -0.02968749776482582, | |
| "reward_std": 0.1530226618051529, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.02968749776482582, | |
| "rewards/alfworld_rollout_reward_func/std": 0.25761842727661133, | |
| "sampling/importance_sampling_ratio/max": 2.5971615314483643, | |
| "sampling/importance_sampling_ratio/mean": 0.6694207191467285, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.766273856163025, | |
| "sampling/sampling_logp_difference/mean": 0.022114556282758713, | |
| "step": 25, | |
| "step_time": 308.79034867000337 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10749.0, | |
| "completions/max_terminated_length": 10749.0, | |
| "completions/mean_length": 9741.78125, | |
| "completions/mean_terminated_length": 9741.78125, | |
| "completions/min_length": 7678.0, | |
| "completions/min_terminated_length": 7678.0, | |
| "entropy": 0.3445580299012363, | |
| "epoch": 0.208, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6585941314697266, | |
| "kl": 0.006797554538934492, | |
| "learning_rate": 2.3187567411850253e-05, | |
| "loss": -0.128, | |
| "num_tokens": 6100879.0, | |
| "reward": -0.049687497317790985, | |
| "reward_std": 0.15042373538017273, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.049687497317790985, | |
| "rewards/alfworld_rollout_reward_func/std": 0.24326865375041962, | |
| "sampling/importance_sampling_ratio/max": 2.6634747982025146, | |
| "sampling/importance_sampling_ratio/mean": 0.7100951075553894, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.047456741333008, | |
| "sampling/sampling_logp_difference/mean": 0.020113738253712654, | |
| "step": 26, | |
| "step_time": 400.11353040200447 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6953.0, | |
| "completions/max_terminated_length": 6953.0, | |
| "completions/mean_length": 6171.40625, | |
| "completions/mean_terminated_length": 6171.40625, | |
| "completions/min_length": 3641.0, | |
| "completions/min_terminated_length": 3641.0, | |
| "entropy": 0.31149382051080465, | |
| "epoch": 0.216, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6737591028213501, | |
| "kl": 0.005847332264238503, | |
| "learning_rate": 2.3015669160389767e-05, | |
| "loss": -0.2064, | |
| "num_tokens": 6319100.0, | |
| "reward": 0.09456250071525574, | |
| "reward_std": 0.29294320940971375, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.09456250071525574, | |
| "rewards/alfworld_rollout_reward_func/std": 0.41390296816825867, | |
| "sampling/importance_sampling_ratio/max": 2.599041223526001, | |
| "sampling/importance_sampling_ratio/mean": 0.5393513441085815, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.398754596710205, | |
| "sampling/sampling_logp_difference/mean": 0.020149247720837593, | |
| "step": 27, | |
| "step_time": 238.93764413800636 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4903.0, | |
| "completions/max_terminated_length": 4903.0, | |
| "completions/mean_length": 2781.09375, | |
| "completions/mean_terminated_length": 2781.09375, | |
| "completions/min_length": 481.0, | |
| "completions/min_terminated_length": 481.0, | |
| "entropy": 0.3561269377823919, | |
| "epoch": 0.224, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3367806077003479, | |
| "kl": 0.006373407544742804, | |
| "learning_rate": 2.283668261820161e-05, | |
| "loss": -0.4613, | |
| "num_tokens": 6423071.0, | |
| "reward": 0.6257500052452087, | |
| "reward_std": 0.43230414390563965, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.6257500052452087, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5019095540046692, | |
| "sampling/importance_sampling_ratio/max": 2.960655927658081, | |
| "sampling/importance_sampling_ratio/mean": 0.511298656463623, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.4176197052001953, | |
| "sampling/sampling_logp_difference/mean": 0.020958153530955315, | |
| "step": 28, | |
| "step_time": 165.59535311200307 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9912.0, | |
| "completions/max_terminated_length": 9912.0, | |
| "completions/mean_length": 8714.34375, | |
| "completions/mean_terminated_length": 8714.34375, | |
| "completions/min_length": 3333.0, | |
| "completions/min_terminated_length": 3333.0, | |
| "entropy": 0.3266249899752438, | |
| "epoch": 0.232, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4753160774707794, | |
| "kl": 0.004642268795578275, | |
| "learning_rate": 2.2650728434635627e-05, | |
| "loss": -0.2244, | |
| "num_tokens": 6727242.0, | |
| "reward": -0.024625001475214958, | |
| "reward_std": 0.15632086992263794, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.024625001475214958, | |
| "rewards/alfworld_rollout_reward_func/std": 0.2652129530906677, | |
| "sampling/importance_sampling_ratio/max": 2.4731619358062744, | |
| "sampling/importance_sampling_ratio/mean": 0.6834967732429504, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7130157947540283, | |
| "sampling/sampling_logp_difference/mean": 0.018951794132590294, | |
| "step": 29, | |
| "step_time": 380.26207640899884 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9703.0, | |
| "completions/max_terminated_length": 9703.0, | |
| "completions/mean_length": 5901.9375, | |
| "completions/mean_terminated_length": 5901.9375, | |
| "completions/min_length": 1826.0, | |
| "completions/min_terminated_length": 1826.0, | |
| "entropy": 0.3159148655831814, | |
| "epoch": 0.24, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.30811336636543274, | |
| "kl": 0.01128493210853776, | |
| "learning_rate": 2.245793195571545e-05, | |
| "loss": 0.2382, | |
| "num_tokens": 6941480.0, | |
| "reward": 0.5379999876022339, | |
| "reward_std": 0.529421329498291, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.5379999876022339, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5327406525611877, | |
| "sampling/importance_sampling_ratio/max": 2.1434435844421387, | |
| "sampling/importance_sampling_ratio/mean": 0.42453715205192566, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 55.80344009399414, | |
| "sampling/sampling_logp_difference/mean": 0.030401449650526047, | |
| "step": 30, | |
| "step_time": 299.59452940400297 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8485.0, | |
| "completions/max_terminated_length": 8485.0, | |
| "completions/mean_length": 6588.875, | |
| "completions/mean_terminated_length": 6588.875, | |
| "completions/min_length": 1445.0, | |
| "completions/min_terminated_length": 1445.0, | |
| "entropy": 0.24451400735415518, | |
| "epoch": 0.248, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3296390771865845, | |
| "kl": 0.005532389182917541, | |
| "learning_rate": 2.22584231396466e-05, | |
| "loss": -0.1623, | |
| "num_tokens": 7173284.0, | |
| "reward": 0.10481249541044235, | |
| "reward_std": 0.294689416885376, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.10481249541044235, | |
| "rewards/alfworld_rollout_reward_func/std": 0.41431817412376404, | |
| "sampling/importance_sampling_ratio/max": 2.418684959411621, | |
| "sampling/importance_sampling_ratio/mean": 0.5402201414108276, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.55106782913208, | |
| "sampling/sampling_logp_difference/mean": 0.014965346083045006, | |
| "step": 31, | |
| "step_time": 294.43344296000396 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 13133.0, | |
| "completions/max_terminated_length": 13133.0, | |
| "completions/mean_length": 7847.59375, | |
| "completions/mean_terminated_length": 7847.59375, | |
| "completions/min_length": 2243.0, | |
| "completions/min_terminated_length": 2243.0, | |
| "entropy": 0.2889693870674819, | |
| "epoch": 0.256, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5536743402481079, | |
| "kl": 0.007658715800062055, | |
| "learning_rate": 2.2052336469215616e-05, | |
| "loss": 0.6745, | |
| "num_tokens": 7456151.0, | |
| "reward": 0.47606247663497925, | |
| "reward_std": 0.5644147396087646, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.47606247663497925, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5439455509185791, | |
| "sampling/importance_sampling_ratio/max": 2.5001707077026367, | |
| "sampling/importance_sampling_ratio/mean": 0.7739982008934021, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.0981035232543945, | |
| "sampling/sampling_logp_difference/mean": 0.020820828154683113, | |
| "step": 32, | |
| "step_time": 411.82160600200405 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5437.0, | |
| "completions/max_terminated_length": 5437.0, | |
| "completions/mean_length": 4673.25, | |
| "completions/mean_terminated_length": 4673.25, | |
| "completions/min_length": 2119.0, | |
| "completions/min_terminated_length": 2119.0, | |
| "entropy": 0.2834106246009469, | |
| "epoch": 0.264, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4467843174934387, | |
| "kl": 0.010782435245346278, | |
| "learning_rate": 2.183981086113933e-05, | |
| "loss": 0.5872, | |
| "num_tokens": 7623039.0, | |
| "reward": 0.0091249980032444, | |
| "reward_std": 0.2119598239660263, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.0091249980032444, | |
| "rewards/alfworld_rollout_reward_func/std": 0.31305113434791565, | |
| "sampling/importance_sampling_ratio/max": 2.066004514694214, | |
| "sampling/importance_sampling_ratio/mean": 0.5565071105957031, | |
| "sampling/importance_sampling_ratio/min": 3.582804466278037e-30, | |
| "sampling/sampling_logp_difference/max": 58.839195251464844, | |
| "sampling/sampling_logp_difference/mean": 0.025124380365014076, | |
| "step": 33, | |
| "step_time": 180.57811164100167 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9775.0, | |
| "completions/max_terminated_length": 9775.0, | |
| "completions/mean_length": 7425.125, | |
| "completions/mean_terminated_length": 7425.125, | |
| "completions/min_length": 2879.0, | |
| "completions/min_terminated_length": 2879.0, | |
| "entropy": 0.34114605700597167, | |
| "epoch": 0.272, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4392341375350952, | |
| "kl": 0.012745806110615376, | |
| "learning_rate": 2.1620989572425376e-05, | |
| "loss": 0.0367, | |
| "num_tokens": 7886531.0, | |
| "reward": 0.29856249690055847, | |
| "reward_std": 0.5038343667984009, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.29856249690055847, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5456701517105103, | |
| "sampling/importance_sampling_ratio/max": 2.859746217727661, | |
| "sampling/importance_sampling_ratio/mean": 0.40094250440597534, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6913090944290161, | |
| "sampling/sampling_logp_difference/mean": 0.02110915444791317, | |
| "step": 34, | |
| "step_time": 348.9884387079983 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9581.0, | |
| "completions/max_terminated_length": 9581.0, | |
| "completions/mean_length": 6927.1875, | |
| "completions/mean_terminated_length": 6927.1875, | |
| "completions/min_length": 2349.0, | |
| "completions/min_terminated_length": 2349.0, | |
| "entropy": 0.3079565931111574, | |
| "epoch": 0.28, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7179110050201416, | |
| "kl": 0.0068448090823949315, | |
| "learning_rate": 2.1396020103807003e-05, | |
| "loss": 1.2054, | |
| "num_tokens": 8133961.0, | |
| "reward": 0.30643752217292786, | |
| "reward_std": 0.5345775485038757, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.30643752217292786, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5518923997879028, | |
| "sampling/importance_sampling_ratio/max": 2.6090545654296875, | |
| "sampling/importance_sampling_ratio/mean": 0.5262423157691956, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.2512085437774658, | |
| "sampling/sampling_logp_difference/mean": 0.019684435799717903, | |
| "step": 35, | |
| "step_time": 332.1077152680009 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6221.0, | |
| "completions/max_terminated_length": 6221.0, | |
| "completions/mean_length": 5477.1875, | |
| "completions/mean_terminated_length": 5477.1875, | |
| "completions/min_length": 3469.0, | |
| "completions/min_terminated_length": 3469.0, | |
| "entropy": 0.28742302511818707, | |
| "epoch": 0.288, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3029775023460388, | |
| "kl": 0.007389042439172044, | |
| "learning_rate": 2.1165054100317364e-05, | |
| "loss": -0.0729, | |
| "num_tokens": 8328591.0, | |
| "reward": -0.043937504291534424, | |
| "reward_std": 0.08157414942979813, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.043937504291534424, | |
| "rewards/alfworld_rollout_reward_func/std": 0.184916689991951, | |
| "sampling/importance_sampling_ratio/max": 2.357729911804199, | |
| "sampling/importance_sampling_ratio/mean": 0.4288977086544037, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.2753419876098633, | |
| "sampling/sampling_logp_difference/mean": 0.017678607255220413, | |
| "step": 36, | |
| "step_time": 228.19555244999538 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10279.0, | |
| "completions/max_terminated_length": 10279.0, | |
| "completions/mean_length": 8721.09375, | |
| "completions/mean_terminated_length": 8721.09375, | |
| "completions/min_length": 2359.0, | |
| "completions/min_terminated_length": 2359.0, | |
| "entropy": 0.3117677140980959, | |
| "epoch": 0.296, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4540128707885742, | |
| "kl": 0.006687425287964288, | |
| "learning_rate": 2.0928247249070227e-05, | |
| "loss": 0.323, | |
| "num_tokens": 8633362.0, | |
| "reward": -0.011625003069639206, | |
| "reward_std": 0.16723808646202087, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.011625003069639206, | |
| "rewards/alfworld_rollout_reward_func/std": 0.32382330298423767, | |
| "sampling/importance_sampling_ratio/max": 2.514087438583374, | |
| "sampling/importance_sampling_ratio/mean": 0.3541397750377655, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.362422466278076, | |
| "sampling/sampling_logp_difference/mean": 0.020557893440127373, | |
| "step": 37, | |
| "step_time": 371.70526020600664 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5569.0, | |
| "completions/max_terminated_length": 5569.0, | |
| "completions/mean_length": 4266.03125, | |
| "completions/mean_terminated_length": 4266.03125, | |
| "completions/min_length": 1452.0, | |
| "completions/min_terminated_length": 1452.0, | |
| "entropy": 0.39905168302357197, | |
| "epoch": 0.304, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6425251960754395, | |
| "kl": 0.007170766235503834, | |
| "learning_rate": 2.0685759174316066e-05, | |
| "loss": 0.3563, | |
| "num_tokens": 8786803.0, | |
| "reward": 0.31306248903274536, | |
| "reward_std": 0.5542499423027039, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.31306248903274536, | |
| "rewards/alfworld_rollout_reward_func/std": 0.528755784034729, | |
| "sampling/importance_sampling_ratio/max": 2.680379629135132, | |
| "sampling/importance_sampling_ratio/mean": 0.4211962819099426, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.3480374813079834, | |
| "sampling/sampling_logp_difference/mean": 0.02187519334256649, | |
| "step": 38, | |
| "step_time": 221.66309013099817 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6825.0, | |
| "completions/max_terminated_length": 6825.0, | |
| "completions/mean_length": 4017.1875, | |
| "completions/mean_terminated_length": 4017.1875, | |
| "completions/min_length": 1882.0, | |
| "completions/min_terminated_length": 1882.0, | |
| "entropy": 0.27494299272075295, | |
| "epoch": 0.312, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4104083776473999, | |
| "kl": 0.008522690397512633, | |
| "learning_rate": 2.0437753329844232e-05, | |
| "loss": 0.3644, | |
| "num_tokens": 8935129.0, | |
| "reward": 0.5581250190734863, | |
| "reward_std": 0.5189322829246521, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.5581250190734863, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5356204509735107, | |
| "sampling/importance_sampling_ratio/max": 1.939102292060852, | |
| "sampling/importance_sampling_ratio/mean": 0.6145648956298828, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6772491931915283, | |
| "sampling/sampling_logp_difference/mean": 0.01894262433052063, | |
| "step": 39, | |
| "step_time": 210.97975667200626 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5856.0, | |
| "completions/max_terminated_length": 5856.0, | |
| "completions/mean_length": 5156.53125, | |
| "completions/mean_terminated_length": 5156.53125, | |
| "completions/min_length": 4694.0, | |
| "completions/min_terminated_length": 4694.0, | |
| "entropy": 0.29539695545099676, | |
| "epoch": 0.32, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5827170014381409, | |
| "kl": 0.01248152791231405, | |
| "learning_rate": 2.0184396888803762e-05, | |
| "loss": -0.077, | |
| "num_tokens": 9118090.0, | |
| "reward": -0.09750000387430191, | |
| "reward_std": 0.0399014875292778, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.09750000387430191, | |
| "rewards/alfworld_rollout_reward_func/std": 0.042804885655641556, | |
| "sampling/importance_sampling_ratio/max": 1.8288264274597168, | |
| "sampling/importance_sampling_ratio/mean": 0.4271472096443176, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 53.8599967956543, | |
| "sampling/sampling_logp_difference/mean": 0.03358942270278931, | |
| "step": 40, | |
| "step_time": 203.72415082099906 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5133.0, | |
| "completions/max_terminated_length": 5133.0, | |
| "completions/mean_length": 2081.125, | |
| "completions/mean_terminated_length": 2081.125, | |
| "completions/min_length": 857.0, | |
| "completions/min_terminated_length": 857.0, | |
| "entropy": 0.3698639366775751, | |
| "epoch": 0.328, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.39960983395576477, | |
| "kl": 0.01080034705955768, | |
| "learning_rate": 1.9925860631017078e-05, | |
| "loss": -0.0291, | |
| "num_tokens": 9202190.0, | |
| "reward": 0.8761249780654907, | |
| "reward_std": 0.17561057209968567, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.8761249780654907, | |
| "rewards/alfworld_rollout_reward_func/std": 0.2839300036430359, | |
| "sampling/importance_sampling_ratio/max": 2.4484190940856934, | |
| "sampling/importance_sampling_ratio/mean": 0.6725524663925171, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6942663192749023, | |
| "sampling/sampling_logp_difference/mean": 0.02107181027531624, | |
| "step": 41, | |
| "step_time": 132.85459781399732 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10435.0, | |
| "completions/max_terminated_length": 10435.0, | |
| "completions/mean_length": 9668.15625, | |
| "completions/mean_terminated_length": 9668.15625, | |
| "completions/min_length": 7131.0, | |
| "completions/min_terminated_length": 7131.0, | |
| "entropy": 0.28246624674648046, | |
| "epoch": 0.336, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.488521009683609, | |
| "kl": 0.005378451882279478, | |
| "learning_rate": 1.9662318827862527e-05, | |
| "loss": -0.6846, | |
| "num_tokens": 9538611.0, | |
| "reward": 0.004812499508261681, | |
| "reward_std": 0.16284173727035522, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.004812499508261681, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3033957779407501, | |
| "sampling/importance_sampling_ratio/max": 1.696621298789978, | |
| "sampling/importance_sampling_ratio/mean": 0.5417707562446594, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.047061562538147, | |
| "sampling/sampling_logp_difference/mean": 0.01622496359050274, | |
| "step": 42, | |
| "step_time": 423.1995726519999 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5896.0, | |
| "completions/max_terminated_length": 5896.0, | |
| "completions/mean_length": 3412.84375, | |
| "completions/mean_terminated_length": 3412.84375, | |
| "completions/min_length": 777.0, | |
| "completions/min_terminated_length": 777.0, | |
| "entropy": 0.45027640648186207, | |
| "epoch": 0.344, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7471272945404053, | |
| "kl": 0.010089495350257494, | |
| "learning_rate": 1.9393949124803384e-05, | |
| "loss": 0.2412, | |
| "num_tokens": 9663502.0, | |
| "reward": 0.44075000286102295, | |
| "reward_std": 0.5147863626480103, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.44075000286102295, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5600635409355164, | |
| "sampling/importance_sampling_ratio/max": 2.6056442260742188, | |
| "sampling/importance_sampling_ratio/mean": 0.4545601010322571, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.2315409183502197, | |
| "sampling/sampling_logp_difference/mean": 0.023922625929117203, | |
| "step": 43, | |
| "step_time": 203.09121891599898 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8859.0, | |
| "completions/max_terminated_length": 8859.0, | |
| "completions/mean_length": 6845.78125, | |
| "completions/mean_terminated_length": 6845.78125, | |
| "completions/min_length": 1717.0, | |
| "completions/min_terminated_length": 1717.0, | |
| "entropy": 0.32285092724487185, | |
| "epoch": 0.352, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4024052023887634, | |
| "kl": 0.008475492584693711, | |
| "learning_rate": 1.9120932421642484e-05, | |
| "loss": 0.5293, | |
| "num_tokens": 9905671.0, | |
| "reward": 0.13581249117851257, | |
| "reward_std": 0.2240503877401352, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.13581249117851257, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4513090252876282, | |
| "sampling/importance_sampling_ratio/max": 2.646895170211792, | |
| "sampling/importance_sampling_ratio/mean": 0.4667580723762512, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.5646392107009888, | |
| "sampling/sampling_logp_difference/mean": 0.019861401990056038, | |
| "step": 44, | |
| "step_time": 315.9145935930028 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9315.0, | |
| "completions/max_terminated_length": 9315.0, | |
| "completions/mean_length": 8032.53125, | |
| "completions/mean_terminated_length": 8032.53125, | |
| "completions/min_length": 4993.0, | |
| "completions/min_terminated_length": 4993.0, | |
| "entropy": 0.29978093737736344, | |
| "epoch": 0.36, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3842543065547943, | |
| "kl": 0.006619712650717702, | |
| "learning_rate": 1.8843452750583195e-05, | |
| "loss": -0.05, | |
| "num_tokens": 10185592.0, | |
| "reward": 0.03681249916553497, | |
| "reward_std": 0.26903754472732544, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.03681249916553497, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3464287221431732, | |
| "sampling/importance_sampling_ratio/max": 2.4647469520568848, | |
| "sampling/importance_sampling_ratio/mean": 0.3931768536567688, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.668698310852051, | |
| "sampling/sampling_logp_difference/mean": 0.019018925726413727, | |
| "step": 45, | |
| "step_time": 344.13413975400545 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6213.0, | |
| "completions/max_terminated_length": 6213.0, | |
| "completions/mean_length": 5737.6875, | |
| "completions/mean_terminated_length": 5737.6875, | |
| "completions/min_length": 3759.0, | |
| "completions/min_terminated_length": 3759.0, | |
| "entropy": 0.3290684539824724, | |
| "epoch": 0.368, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6180854439735413, | |
| "kl": 0.024709066594368778, | |
| "learning_rate": 1.856169715217896e-05, | |
| "loss": 0.3411, | |
| "num_tokens": 10387950.0, | |
| "reward": -0.06937500089406967, | |
| "reward_std": 0.16165806353092194, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.06937500089406967, | |
| "rewards/alfworld_rollout_reward_func/std": 0.2620733380317688, | |
| "sampling/importance_sampling_ratio/max": 1.737358808517456, | |
| "sampling/importance_sampling_ratio/mean": 0.3259963095188141, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.5090951919555664, | |
| "sampling/sampling_logp_difference/mean": 0.019301004707813263, | |
| "step": 46, | |
| "step_time": 242.2902739599922 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10361.0, | |
| "completions/max_terminated_length": 10361.0, | |
| "completions/mean_length": 8327.21875, | |
| "completions/mean_terminated_length": 8327.21875, | |
| "completions/min_length": 2893.0, | |
| "completions/min_terminated_length": 2893.0, | |
| "entropy": 0.3274059356190264, | |
| "epoch": 0.376, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4274188280105591, | |
| "kl": 0.010049835465906654, | |
| "learning_rate": 1.8275855549254953e-05, | |
| "loss": -0.2629, | |
| "num_tokens": 10679093.0, | |
| "reward": 0.08993750810623169, | |
| "reward_std": 0.35606837272644043, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.08993750810623169, | |
| "rewards/alfworld_rollout_reward_func/std": 0.42536285519599915, | |
| "sampling/importance_sampling_ratio/max": 2.3043253421783447, | |
| "sampling/importance_sampling_ratio/mean": 0.4068600535392761, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6962127685546875, | |
| "sampling/sampling_logp_difference/mean": 0.020758148282766342, | |
| "step": 47, | |
| "step_time": 370.7235914090161 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5495.0, | |
| "completions/max_terminated_length": 5495.0, | |
| "completions/mean_length": 3065.90625, | |
| "completions/mean_terminated_length": 3065.90625, | |
| "completions/min_length": 579.0, | |
| "completions/min_terminated_length": 579.0, | |
| "entropy": 0.3348121759481728, | |
| "epoch": 0.384, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2784886062145233, | |
| "kl": 0.009557744706398807, | |
| "learning_rate": 1.798612061888695e-05, | |
| "loss": -0.0538, | |
| "num_tokens": 10794834.0, | |
| "reward": 0.6332499980926514, | |
| "reward_std": 0.5017424821853638, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.6332499980926514, | |
| "rewards/alfworld_rollout_reward_func/std": 0.49993231892585754, | |
| "sampling/importance_sampling_ratio/max": 1.7117670774459839, | |
| "sampling/importance_sampling_ratio/mean": 0.546383261680603, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.1129664182662964, | |
| "sampling/sampling_logp_difference/mean": 0.02091594971716404, | |
| "step": 48, | |
| "step_time": 173.00113746399438 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4629.0, | |
| "completions/max_terminated_length": 4629.0, | |
| "completions/mean_length": 1015.0, | |
| "completions/mean_terminated_length": 1015.0, | |
| "completions/min_length": 505.0, | |
| "completions/min_terminated_length": 505.0, | |
| "entropy": 0.3150682970881462, | |
| "epoch": 0.392, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.39293837547302246, | |
| "kl": 0.01074478572991211, | |
| "learning_rate": 1.7692687662523583e-05, | |
| "loss": 0.3488, | |
| "num_tokens": 10841970.0, | |
| "reward": 0.9421250224113464, | |
| "reward_std": 0.08922727406024933, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.9421250224113464, | |
| "rewards/alfworld_rollout_reward_func/std": 0.20758825540542603, | |
| "sampling/importance_sampling_ratio/max": 2.490204095840454, | |
| "sampling/importance_sampling_ratio/mean": 0.959223210811615, | |
| "sampling/importance_sampling_ratio/min": 0.02937433123588562, | |
| "sampling/sampling_logp_difference/max": 1.6584293842315674, | |
| "sampling/sampling_logp_difference/mean": 0.020461907610297203, | |
| "step": 49, | |
| "step_time": 92.39997174798918 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7372.0, | |
| "completions/max_terminated_length": 7372.0, | |
| "completions/mean_length": 6640.0, | |
| "completions/mean_terminated_length": 6640.0, | |
| "completions/min_length": 6025.0, | |
| "completions/min_terminated_length": 6025.0, | |
| "entropy": 0.3124278010800481, | |
| "epoch": 0.4, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25588029623031616, | |
| "kl": 0.010321544992621057, | |
| "learning_rate": 1.739575447433963e-05, | |
| "loss": -0.0403, | |
| "num_tokens": 11075154.0, | |
| "reward": -0.08562500029802322, | |
| "reward_std": 0.03361169621348381, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.08562500029802322, | |
| "rewards/alfworld_rollout_reward_func/std": 0.04079354181885719, | |
| "sampling/importance_sampling_ratio/max": 2.596904754638672, | |
| "sampling/importance_sampling_ratio/mean": 0.33153465390205383, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 57.544639587402344, | |
| "sampling/sampling_logp_difference/mean": 0.02437865547835827, | |
| "step": 50, | |
| "step_time": 265.4482773190066 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10133.0, | |
| "completions/max_terminated_length": 10133.0, | |
| "completions/mean_length": 7551.125, | |
| "completions/mean_terminated_length": 7551.125, | |
| "completions/min_length": 1574.0, | |
| "completions/min_terminated_length": 1574.0, | |
| "entropy": 0.3586070057936013, | |
| "epoch": 0.408, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5439701676368713, | |
| "kl": 0.009389235550770536, | |
| "learning_rate": 1.7095521207909e-05, | |
| "loss": 0.1081, | |
| "num_tokens": 11339414.0, | |
| "reward": 0.03581250086426735, | |
| "reward_std": 0.29458412528038025, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.03581250086426735, | |
| "rewards/alfworld_rollout_reward_func/std": 0.39730069041252136, | |
| "sampling/importance_sampling_ratio/max": 2.3939778804779053, | |
| "sampling/importance_sampling_ratio/mean": 0.46745312213897705, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.9706189632415771, | |
| "sampling/sampling_logp_difference/mean": 0.02153097279369831, | |
| "step": 51, | |
| "step_time": 354.3591809840109 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9795.0, | |
| "completions/max_terminated_length": 9795.0, | |
| "completions/mean_length": 8446.46875, | |
| "completions/mean_terminated_length": 8446.46875, | |
| "completions/min_length": 3995.0, | |
| "completions/min_terminated_length": 3995.0, | |
| "entropy": 0.36232828767970204, | |
| "epoch": 0.416, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5382326245307922, | |
| "kl": 0.011482410060125403, | |
| "learning_rate": 1.6792190241287358e-05, | |
| "loss": 0.7703, | |
| "num_tokens": 11634213.0, | |
| "reward": 0.05806249752640724, | |
| "reward_std": 0.2869499921798706, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.05806249752640724, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3851432800292969, | |
| "sampling/importance_sampling_ratio/max": 1.6283434629440308, | |
| "sampling/importance_sampling_ratio/mean": 0.35767966508865356, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 50.320884704589844, | |
| "sampling/sampling_logp_difference/mean": 0.028291059657931328, | |
| "step": 52, | |
| "step_time": 381.02555549900353 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5540.0, | |
| "completions/max_terminated_length": 5540.0, | |
| "completions/mean_length": 4920.09375, | |
| "completions/mean_terminated_length": 4920.09375, | |
| "completions/min_length": 3916.0, | |
| "completions/min_terminated_length": 3916.0, | |
| "entropy": 0.33526887465268373, | |
| "epoch": 0.424, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.49836915731430054, | |
| "kl": 0.006760991345799994, | |
| "learning_rate": 1.6485966040595234e-05, | |
| "loss": 0.2884, | |
| "num_tokens": 11809064.0, | |
| "reward": 0.041937507688999176, | |
| "reward_std": 0.2676677107810974, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.041937507688999176, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3419354557991028, | |
| "sampling/importance_sampling_ratio/max": 2.4798102378845215, | |
| "sampling/importance_sampling_ratio/mean": 0.39536625146865845, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.4147335290908813, | |
| "sampling/sampling_logp_difference/mean": 0.020684119313955307, | |
| "step": 53, | |
| "step_time": 234.1235085829867 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5491.0, | |
| "completions/max_terminated_length": 5491.0, | |
| "completions/mean_length": 4343.40625, | |
| "completions/mean_terminated_length": 4343.40625, | |
| "completions/min_length": 2950.0, | |
| "completions/min_terminated_length": 2950.0, | |
| "entropy": 0.3150371379451826, | |
| "epoch": 0.432, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3623775541782379, | |
| "kl": 0.016882637079106644, | |
| "learning_rate": 1.6177055022193705e-05, | |
| "loss": -0.0492, | |
| "num_tokens": 11964309.0, | |
| "reward": 0.09349999576807022, | |
| "reward_std": 0.2926024794578552, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.09349999576807022, | |
| "rewards/alfworld_rollout_reward_func/std": 0.41188251972198486, | |
| "sampling/importance_sampling_ratio/max": 2.349982261657715, | |
| "sampling/importance_sampling_ratio/mean": 0.3969458341598511, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.997692108154297, | |
| "sampling/sampling_logp_difference/mean": 0.02303631231188774, | |
| "step": 54, | |
| "step_time": 198.36046067398638 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9719.0, | |
| "completions/max_terminated_length": 9719.0, | |
| "completions/mean_length": 7502.34375, | |
| "completions/mean_terminated_length": 7502.34375, | |
| "completions/min_length": 1859.0, | |
| "completions/min_terminated_length": 1859.0, | |
| "entropy": 0.30430709826759994, | |
| "epoch": 0.44, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.44352656602859497, | |
| "kl": 0.0072795703235897236, | |
| "learning_rate": 1.5865665413545433e-05, | |
| "loss": 0.3331, | |
| "num_tokens": 12230304.0, | |
| "reward": 0.2774375081062317, | |
| "reward_std": 0.5174306035041809, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.2774375081062317, | |
| "rewards/alfworld_rollout_reward_func/std": 0.537744402885437, | |
| "sampling/importance_sampling_ratio/max": 2.195627212524414, | |
| "sampling/importance_sampling_ratio/mean": 0.3545913100242615, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.711559534072876, | |
| "sampling/sampling_logp_difference/mean": 0.01915351301431656, | |
| "step": 55, | |
| "step_time": 336.9539677490029 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5623.0, | |
| "completions/max_terminated_length": 5623.0, | |
| "completions/mean_length": 3929.40625, | |
| "completions/mean_terminated_length": 3929.40625, | |
| "completions/min_length": 696.0, | |
| "completions/min_terminated_length": 696.0, | |
| "entropy": 0.3456273628398776, | |
| "epoch": 0.448, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2932792007923126, | |
| "kl": 0.011585585234570317, | |
| "learning_rate": 1.5552007112854894e-05, | |
| "loss": 0.2115, | |
| "num_tokens": 12373933.0, | |
| "reward": 0.5378749966621399, | |
| "reward_std": 0.5084604024887085, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.5378749966621399, | |
| "rewards/alfworld_rollout_reward_func/std": 0.535780131816864, | |
| "sampling/importance_sampling_ratio/max": 2.6537976264953613, | |
| "sampling/importance_sampling_ratio/mean": 0.4917897582054138, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6209819316864014, | |
| "sampling/sampling_logp_difference/mean": 0.02049492858350277, | |
| "step": 56, | |
| "step_time": 196.3063545789919 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7242.0, | |
| "completions/max_terminated_length": 7242.0, | |
| "completions/mean_length": 6137.5, | |
| "completions/mean_terminated_length": 6137.5, | |
| "completions/min_length": 1471.0, | |
| "completions/min_terminated_length": 1471.0, | |
| "entropy": 0.2984403392765671, | |
| "epoch": 0.456, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.34220486879348755, | |
| "kl": 0.006747693880242878, | |
| "learning_rate": 1.5236291547582437e-05, | |
| "loss": 0.3575, | |
| "num_tokens": 12591325.0, | |
| "reward": 0.14399999380111694, | |
| "reward_std": 0.29427629709243774, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.14399999380111694, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4284597933292389, | |
| "sampling/importance_sampling_ratio/max": 2.0604751110076904, | |
| "sampling/importance_sampling_ratio/mean": 0.42357033491134644, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.4621469974517822, | |
| "sampling/sampling_logp_difference/mean": 0.01661795936524868, | |
| "step": 57, | |
| "step_time": 275.63650079599756 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9986.0, | |
| "completions/max_terminated_length": 9986.0, | |
| "completions/mean_length": 8814.6875, | |
| "completions/mean_terminated_length": 8814.6875, | |
| "completions/min_length": 2337.0, | |
| "completions/min_terminated_length": 2337.0, | |
| "entropy": 0.3586568986065686, | |
| "epoch": 0.464, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.47871214151382446, | |
| "kl": 0.007779015490086749, | |
| "learning_rate": 1.4918731531927497e-05, | |
| "loss": -0.5934, | |
| "num_tokens": 12899251.0, | |
| "reward": 0.044374994933605194, | |
| "reward_std": 0.3030627369880676, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.044374994933605194, | |
| "rewards/alfworld_rollout_reward_func/std": 0.39909231662750244, | |
| "sampling/importance_sampling_ratio/max": 2.446885347366333, | |
| "sampling/importance_sampling_ratio/mean": 0.42126935720443726, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7695283889770508, | |
| "sampling/sampling_logp_difference/mean": 0.02191038988530636, | |
| "step": 58, | |
| "step_time": 390.69952411300983 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7052.0, | |
| "completions/max_terminated_length": 7052.0, | |
| "completions/mean_length": 5947.40625, | |
| "completions/mean_terminated_length": 5947.40625, | |
| "completions/min_length": 2393.0, | |
| "completions/min_terminated_length": 2393.0, | |
| "entropy": 0.3142848704010248, | |
| "epoch": 0.472, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5331009030342102, | |
| "kl": 0.012195366049127188, | |
| "learning_rate": 1.4599541123377061e-05, | |
| "loss": -0.4684, | |
| "num_tokens": 13110208.0, | |
| "reward": 0.07756249606609344, | |
| "reward_std": 0.3419586420059204, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.07756249606609344, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3887258768081665, | |
| "sampling/importance_sampling_ratio/max": 2.6775338649749756, | |
| "sampling/importance_sampling_ratio/mean": 0.4351937770843506, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.95979118347168, | |
| "sampling/sampling_logp_difference/mean": 0.020216144621372223, | |
| "step": 59, | |
| "step_time": 243.03599319599743 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9655.0, | |
| "completions/max_terminated_length": 9655.0, | |
| "completions/mean_length": 5772.5625, | |
| "completions/mean_terminated_length": 5772.5625, | |
| "completions/min_length": 1919.0, | |
| "completions/min_terminated_length": 1919.0, | |
| "entropy": 0.30983406328596175, | |
| "epoch": 0.48, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.53172367811203, | |
| "kl": 0.010136082419194281, | |
| "learning_rate": 1.4278935478416066e-05, | |
| "loss": 0.0168, | |
| "num_tokens": 13317810.0, | |
| "reward": 0.5102499723434448, | |
| "reward_std": 0.5633312463760376, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.5102499723434448, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5470424294471741, | |
| "sampling/importance_sampling_ratio/max": 2.9595675468444824, | |
| "sampling/importance_sampling_ratio/mean": 0.6123539805412292, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.608839988708496, | |
| "sampling/sampling_logp_difference/mean": 0.020261507481336594, | |
| "step": 60, | |
| "step_time": 298.4086459939899 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10755.0, | |
| "completions/max_terminated_length": 10755.0, | |
| "completions/mean_length": 6594.71875, | |
| "completions/mean_terminated_length": 6594.71875, | |
| "completions/min_length": 3297.0, | |
| "completions/min_terminated_length": 3297.0, | |
| "entropy": 0.253853059373796, | |
| "epoch": 0.488, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6812533140182495, | |
| "kl": 0.006578363383596297, | |
| "learning_rate": 1.3957130707496991e-05, | |
| "loss": 0.6806, | |
| "num_tokens": 13549897.0, | |
| "reward": 0.29243749380111694, | |
| "reward_std": 0.47611552476882935, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.29243749380111694, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5179726481437683, | |
| "sampling/importance_sampling_ratio/max": 2.8079097270965576, | |
| "sampling/importance_sampling_ratio/mean": 0.6100484132766724, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.366208553314209, | |
| "sampling/sampling_logp_difference/mean": 0.01589721068739891, | |
| "step": 61, | |
| "step_time": 345.328052976005 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6121.0, | |
| "completions/max_terminated_length": 6121.0, | |
| "completions/mean_length": 5639.4375, | |
| "completions/mean_terminated_length": 5639.4375, | |
| "completions/min_length": 1575.0, | |
| "completions/min_terminated_length": 1575.0, | |
| "entropy": 0.3537342040799558, | |
| "epoch": 0.496, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4658977687358856, | |
| "kl": 0.008820160976029001, | |
| "learning_rate": 1.363434372936643e-05, | |
| "loss": 0.0373, | |
| "num_tokens": 13749239.0, | |
| "reward": -0.04087500274181366, | |
| "reward_std": 0.07875211536884308, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.04087500274181366, | |
| "rewards/alfworld_rollout_reward_func/std": 0.18735504150390625, | |
| "sampling/importance_sampling_ratio/max": 2.8549065589904785, | |
| "sampling/importance_sampling_ratio/mean": 0.4742495119571686, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.172121286392212, | |
| "sampling/sampling_logp_difference/mean": 0.019000394269824028, | |
| "step": 62, | |
| "step_time": 261.6466159019983 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4729.0, | |
| "completions/max_terminated_length": 4729.0, | |
| "completions/mean_length": 3591.78125, | |
| "completions/mean_terminated_length": 3591.78125, | |
| "completions/min_length": 980.0, | |
| "completions/min_terminated_length": 980.0, | |
| "entropy": 0.39169206377118826, | |
| "epoch": 0.504, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.247410386800766, | |
| "kl": 0.008050664589973167, | |
| "learning_rate": 1.3310792124846788e-05, | |
| "loss": -0.2754, | |
| "num_tokens": 13878928.0, | |
| "reward": 0.16506250202655792, | |
| "reward_std": 0.45310160517692566, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.16506250202655792, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5017751455307007, | |
| "sampling/importance_sampling_ratio/max": 1.3678574562072754, | |
| "sampling/importance_sampling_ratio/mean": 0.2883392870426178, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.1917786598205566, | |
| "sampling/sampling_logp_difference/mean": 0.021689990535378456, | |
| "step": 63, | |
| "step_time": 211.52569950699763 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5346.0, | |
| "completions/max_terminated_length": 5346.0, | |
| "completions/mean_length": 4886.34375, | |
| "completions/mean_terminated_length": 4886.34375, | |
| "completions/min_length": 2880.0, | |
| "completions/min_terminated_length": 2880.0, | |
| "entropy": 0.3370038694702089, | |
| "epoch": 0.512, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6040080189704895, | |
| "kl": 0.007575266048661433, | |
| "learning_rate": 1.2986693990171722e-05, | |
| "loss": 1.0625, | |
| "num_tokens": 14052507.0, | |
| "reward": -0.017375001683831215, | |
| "reward_std": 0.17574599385261536, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.017375001683831215, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3123514950275421, | |
| "sampling/importance_sampling_ratio/max": 2.4749438762664795, | |
| "sampling/importance_sampling_ratio/mean": 0.4803212881088257, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.2786328792572021, | |
| "sampling/sampling_logp_difference/mean": 0.018290970474481583, | |
| "step": 64, | |
| "step_time": 227.5868356469873 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5429.0, | |
| "completions/max_terminated_length": 5429.0, | |
| "completions/mean_length": 2972.3125, | |
| "completions/mean_terminated_length": 2972.3125, | |
| "completions/min_length": 539.0, | |
| "completions/min_terminated_length": 539.0, | |
| "entropy": 0.3263692925684154, | |
| "epoch": 0.52, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17109909653663635, | |
| "kl": 0.007112714993127156, | |
| "learning_rate": 1.2662267789974137e-05, | |
| "loss": -0.1672, | |
| "num_tokens": 14165157.0, | |
| "reward": 0.6520624756813049, | |
| "reward_std": 0.3671799898147583, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.6520624756813049, | |
| "rewards/alfworld_rollout_reward_func/std": 0.49012401700019836, | |
| "sampling/importance_sampling_ratio/max": 1.6191086769104004, | |
| "sampling/importance_sampling_ratio/mean": 0.40486860275268555, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.247586965560913, | |
| "sampling/sampling_logp_difference/mean": 0.019062954932451248, | |
| "step": 65, | |
| "step_time": 169.92830283098374 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10669.0, | |
| "completions/max_terminated_length": 10669.0, | |
| "completions/mean_length": 9819.1875, | |
| "completions/mean_terminated_length": 9819.1875, | |
| "completions/min_length": 5938.0, | |
| "completions/min_terminated_length": 5938.0, | |
| "entropy": 0.28255544137209654, | |
| "epoch": 0.528, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7858016490936279, | |
| "kl": 0.016726812566048466, | |
| "learning_rate": 1.2337732210025866e-05, | |
| "loss": 0.8719, | |
| "num_tokens": 14506315.0, | |
| "reward": -0.07387499511241913, | |
| "reward_std": 0.10663385689258575, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.07387499511241913, | |
| "rewards/alfworld_rollout_reward_func/std": 0.19619078934192657, | |
| "sampling/importance_sampling_ratio/max": 2.002222776412964, | |
| "sampling/importance_sampling_ratio/mean": 0.39136213064193726, | |
| "sampling/importance_sampling_ratio/min": 1.4739535799890291e-05, | |
| "sampling/sampling_logp_difference/max": 5.542607307434082, | |
| "sampling/sampling_logp_difference/mean": 0.01821236126124859, | |
| "step": 66, | |
| "step_time": 419.0694372189937 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10873.0, | |
| "completions/max_terminated_length": 10873.0, | |
| "completions/mean_length": 9183.96875, | |
| "completions/mean_terminated_length": 9183.96875, | |
| "completions/min_length": 3183.0, | |
| "completions/min_terminated_length": 3183.0, | |
| "entropy": 0.38195388251915574, | |
| "epoch": 0.536, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4079846143722534, | |
| "kl": 0.0140747545810882, | |
| "learning_rate": 1.2013306009828281e-05, | |
| "loss": 0.3006, | |
| "num_tokens": 14827306.0, | |
| "reward": 0.018437502905726433, | |
| "reward_std": 0.22722002863883972, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.018437502905726433, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3590969741344452, | |
| "sampling/importance_sampling_ratio/max": 2.1235499382019043, | |
| "sampling/importance_sampling_ratio/mean": 0.36683690547943115, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.5769914388656616, | |
| "sampling/sampling_logp_difference/mean": 0.024590400978922844, | |
| "step": 67, | |
| "step_time": 386.19285881999167 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5260.0, | |
| "completions/max_terminated_length": 5260.0, | |
| "completions/mean_length": 4081.625, | |
| "completions/mean_terminated_length": 4081.625, | |
| "completions/min_length": 1937.0, | |
| "completions/min_terminated_length": 1937.0, | |
| "entropy": 0.35961280949413776, | |
| "epoch": 0.544, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5163018107414246, | |
| "kl": 0.011940339973079972, | |
| "learning_rate": 1.1689207875153212e-05, | |
| "loss": 0.474, | |
| "num_tokens": 14974590.0, | |
| "reward": 0.37187498807907104, | |
| "reward_std": 0.46666616201400757, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.37187498807907104, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5451789498329163, | |
| "sampling/importance_sampling_ratio/max": 2.092132568359375, | |
| "sampling/importance_sampling_ratio/mean": 0.5891081094741821, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.025409460067749, | |
| "sampling/sampling_logp_difference/mean": 0.022658195346593857, | |
| "step": 68, | |
| "step_time": 189.4174261460139 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6060.0, | |
| "completions/max_terminated_length": 6060.0, | |
| "completions/mean_length": 1962.71875, | |
| "completions/mean_terminated_length": 1962.71875, | |
| "completions/min_length": 780.0, | |
| "completions/min_terminated_length": 780.0, | |
| "entropy": 0.24381280411034822, | |
| "epoch": 0.552, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.283759206533432, | |
| "kl": 0.028047901825630106, | |
| "learning_rate": 1.1365656270633572e-05, | |
| "loss": 0.1314, | |
| "num_tokens": 15056149.0, | |
| "reward": 0.8946250081062317, | |
| "reward_std": 0.16643308103084564, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.8946250081062317, | |
| "rewards/alfworld_rollout_reward_func/std": 0.28478240966796875, | |
| "sampling/importance_sampling_ratio/max": 1.3010145425796509, | |
| "sampling/importance_sampling_ratio/mean": 0.6231130361557007, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.1777536869049072, | |
| "sampling/sampling_logp_difference/mean": 0.01496939081698656, | |
| "step": 69, | |
| "step_time": 136.05892026298898 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8510.0, | |
| "completions/max_terminated_length": 8510.0, | |
| "completions/mean_length": 7672.125, | |
| "completions/mean_terminated_length": 7672.125, | |
| "completions/min_length": 7299.0, | |
| "completions/min_terminated_length": 7299.0, | |
| "entropy": 0.3060699696652591, | |
| "epoch": 0.56, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5419595837593079, | |
| "kl": 0.0077493647404480726, | |
| "learning_rate": 1.1042869292503012e-05, | |
| "loss": 0.5471, | |
| "num_tokens": 15324345.0, | |
| "reward": -0.08312499523162842, | |
| "reward_std": 0.02605646476149559, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.08312499523162842, | |
| "rewards/alfworld_rollout_reward_func/std": 0.0315653458237648, | |
| "sampling/importance_sampling_ratio/max": 1.9645739793777466, | |
| "sampling/importance_sampling_ratio/mean": 0.5267171859741211, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.3863511085510254, | |
| "sampling/sampling_logp_difference/mean": 0.017259342595934868, | |
| "step": 70, | |
| "step_time": 321.4900386369991 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10572.0, | |
| "completions/max_terminated_length": 10572.0, | |
| "completions/mean_length": 9703.9375, | |
| "completions/mean_terminated_length": 9703.9375, | |
| "completions/min_length": 4990.0, | |
| "completions/min_terminated_length": 4990.0, | |
| "entropy": 0.3014857741072774, | |
| "epoch": 0.568, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6044378280639648, | |
| "kl": 0.010106177673151251, | |
| "learning_rate": 1.0721064521583937e-05, | |
| "loss": 0.0864, | |
| "num_tokens": 15662295.0, | |
| "reward": -0.03849999979138374, | |
| "reward_std": 0.14729847013950348, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.03849999979138374, | |
| "rewards/alfworld_rollout_reward_func/std": 0.26008138060569763, | |
| "sampling/importance_sampling_ratio/max": 2.1879913806915283, | |
| "sampling/importance_sampling_ratio/mean": 0.5329915285110474, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.4026594161987305, | |
| "sampling/sampling_logp_difference/mean": 0.017540952190756798, | |
| "step": 71, | |
| "step_time": 421.89684666199173 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6152.0, | |
| "completions/max_terminated_length": 6152.0, | |
| "completions/mean_length": 5449.53125, | |
| "completions/mean_terminated_length": 5449.53125, | |
| "completions/min_length": 4948.0, | |
| "completions/min_terminated_length": 4948.0, | |
| "entropy": 0.3671146659180522, | |
| "epoch": 0.576, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5124484896659851, | |
| "kl": 0.009964851851691492, | |
| "learning_rate": 1.0400458876622939e-05, | |
| "loss": 0.4011, | |
| "num_tokens": 15854952.0, | |
| "reward": -0.0793749988079071, | |
| "reward_std": 0.020163455978035927, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.0793749988079071, | |
| "rewards/alfworld_rollout_reward_func/std": 0.021987900137901306, | |
| "sampling/importance_sampling_ratio/max": 1.6604212522506714, | |
| "sampling/importance_sampling_ratio/mean": 0.45693981647491455, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 50.06657791137695, | |
| "sampling/sampling_logp_difference/mean": 0.022638168185949326, | |
| "step": 72, | |
| "step_time": 250.9183966080127 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8627.0, | |
| "completions/max_terminated_length": 8627.0, | |
| "completions/mean_length": 7516.8125, | |
| "completions/mean_terminated_length": 7516.8125, | |
| "completions/min_length": 2413.0, | |
| "completions/min_terminated_length": 2413.0, | |
| "entropy": 0.30488129099830985, | |
| "epoch": 0.584, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.9326367378234863, | |
| "kl": 0.009634538357204292, | |
| "learning_rate": 1.0081268468072504e-05, | |
| "loss": -0.7301, | |
| "num_tokens": 16118786.0, | |
| "reward": 0.13762499392032623, | |
| "reward_std": 0.4009949862957001, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.13762499392032623, | |
| "rewards/alfworld_rollout_reward_func/std": 0.43177902698516846, | |
| "sampling/importance_sampling_ratio/max": 2.628365993499756, | |
| "sampling/importance_sampling_ratio/mean": 0.6898477077484131, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.0378785133361816, | |
| "sampling/sampling_logp_difference/mean": 0.01770058088004589, | |
| "step": 73, | |
| "step_time": 324.3881754160029 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8399.0, | |
| "completions/max_terminated_length": 8399.0, | |
| "completions/mean_length": 4676.71875, | |
| "completions/mean_terminated_length": 4676.71875, | |
| "completions/min_length": 1857.0, | |
| "completions/min_terminated_length": 1857.0, | |
| "entropy": 0.3089015153236687, | |
| "epoch": 0.592, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3285577893257141, | |
| "kl": 0.013235979298769962, | |
| "learning_rate": 9.763708452417566e-06, | |
| "loss": 0.3012, | |
| "num_tokens": 16291545.0, | |
| "reward": 0.6901249885559082, | |
| "reward_std": 0.35391491651535034, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.6901249885559082, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4766641855239868, | |
| "sampling/importance_sampling_ratio/max": 1.9804767370224, | |
| "sampling/importance_sampling_ratio/mean": 0.5676183104515076, | |
| "sampling/importance_sampling_ratio/min": 1.8734867390329565e-35, | |
| "sampling/sampling_logp_difference/max": 52.56696319580078, | |
| "sampling/sampling_logp_difference/mean": 0.02483256347477436, | |
| "step": 74, | |
| "step_time": 270.81252180201045 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5990.0, | |
| "completions/max_terminated_length": 5990.0, | |
| "completions/mean_length": 2401.125, | |
| "completions/mean_terminated_length": 2401.125, | |
| "completions/min_length": 692.0, | |
| "completions/min_terminated_length": 692.0, | |
| "entropy": 0.2977066827006638, | |
| "epoch": 0.6, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.7037971615791321, | |
| "kl": 0.019976254399807658, | |
| "learning_rate": 9.44799288714511e-06, | |
| "loss": 0.9671, | |
| "num_tokens": 16387805.0, | |
| "reward": 0.8665000200271606, | |
| "reward_std": 0.16209571063518524, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.8665000200271606, | |
| "rewards/alfworld_rollout_reward_func/std": 0.31956785917282104, | |
| "sampling/importance_sampling_ratio/max": 2.9782536029815674, | |
| "sampling/importance_sampling_ratio/mean": 0.8318759202957153, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.887710452079773, | |
| "sampling/sampling_logp_difference/mean": 0.018892308697104454, | |
| "step": 75, | |
| "step_time": 166.59206653699948 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5156.0, | |
| "completions/max_terminated_length": 5156.0, | |
| "completions/mean_length": 3333.34375, | |
| "completions/mean_terminated_length": 3333.34375, | |
| "completions/min_length": 1359.0, | |
| "completions/min_terminated_length": 1359.0, | |
| "entropy": 0.3274269704706967, | |
| "epoch": 0.608, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4839501678943634, | |
| "kl": 0.01072185774683021, | |
| "learning_rate": 9.134334586454569e-06, | |
| "loss": 0.3904, | |
| "num_tokens": 16510632.0, | |
| "reward": 0.5139999985694885, | |
| "reward_std": 0.49413302540779114, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.5139999985694885, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5383288264274597, | |
| "sampling/importance_sampling_ratio/max": 1.796036958694458, | |
| "sampling/importance_sampling_ratio/mean": 0.42258331179618835, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.0271837711334229, | |
| "sampling/sampling_logp_difference/mean": 0.019824257120490074, | |
| "step": 76, | |
| "step_time": 179.70215874598216 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9625.0, | |
| "completions/max_terminated_length": 9625.0, | |
| "completions/mean_length": 8155.625, | |
| "completions/mean_terminated_length": 8155.625, | |
| "completions/min_length": 1912.0, | |
| "completions/min_terminated_length": 1912.0, | |
| "entropy": 0.3397471741773188, | |
| "epoch": 0.616, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26549482345581055, | |
| "kl": 0.01535673845501151, | |
| "learning_rate": 8.822944977806296e-06, | |
| "loss": -0.0306, | |
| "num_tokens": 16796252.0, | |
| "reward": 0.06531249731779099, | |
| "reward_std": 0.2820407748222351, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.06531249731779099, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3899073898792267, | |
| "sampling/importance_sampling_ratio/max": 2.449633836746216, | |
| "sampling/importance_sampling_ratio/mean": 0.22956383228302002, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 49.174781799316406, | |
| "sampling/sampling_logp_difference/mean": 0.025772254914045334, | |
| "step": 77, | |
| "step_time": 386.00914664800075 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9988.0, | |
| "completions/max_terminated_length": 9988.0, | |
| "completions/mean_length": 7138.4375, | |
| "completions/mean_terminated_length": 7138.4375, | |
| "completions/min_length": 1521.0, | |
| "completions/min_terminated_length": 1521.0, | |
| "entropy": 0.4407801004126668, | |
| "epoch": 0.624, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3279080390930176, | |
| "kl": 0.010239457304123789, | |
| "learning_rate": 8.514033959404768e-06, | |
| "loss": 0.3758, | |
| "num_tokens": 17044234.0, | |
| "reward": 0.0768750011920929, | |
| "reward_std": 0.36151063442230225, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.0768750011920929, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4254858195781708, | |
| "sampling/importance_sampling_ratio/max": 2.027116298675537, | |
| "sampling/importance_sampling_ratio/mean": 0.24855472147464752, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.5956943035125732, | |
| "sampling/sampling_logp_difference/mean": 0.025417163968086243, | |
| "step": 78, | |
| "step_time": 341.2563546900019 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 6459.0, | |
| "completions/max_terminated_length": 5516.0, | |
| "completions/mean_length": 4264.03125, | |
| "completions/mean_terminated_length": 4193.2255859375, | |
| "completions/min_length": 1240.0, | |
| "completions/min_terminated_length": 1240.0, | |
| "entropy": 0.35723056783899665, | |
| "epoch": 0.632, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5109708905220032, | |
| "kl": 0.008868186720064841, | |
| "learning_rate": 8.207809758712648e-06, | |
| "loss": -0.6946, | |
| "num_tokens": 17197227.0, | |
| "reward": 0.23375000059604645, | |
| "reward_std": 0.4359171390533447, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.23375000059604645, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5250879526138306, | |
| "sampling/importance_sampling_ratio/max": 2.4404332637786865, | |
| "sampling/importance_sampling_ratio/mean": 0.4912329316139221, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 19.107128143310547, | |
| "sampling/sampling_logp_difference/mean": 0.019834455102682114, | |
| "step": 79, | |
| "step_time": 239.72380435900413 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6371.0, | |
| "completions/max_terminated_length": 6371.0, | |
| "completions/mean_length": 5661.84375, | |
| "completions/mean_terminated_length": 5661.84375, | |
| "completions/min_length": 1258.0, | |
| "completions/min_terminated_length": 1258.0, | |
| "entropy": 0.276084772311151, | |
| "epoch": 0.64, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6164595484733582, | |
| "kl": 0.009095613342651632, | |
| "learning_rate": 7.904478792090999e-06, | |
| "loss": -1.3354, | |
| "num_tokens": 17397702.0, | |
| "reward": 0.052375003695487976, | |
| "reward_std": 0.23210693895816803, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.052375003695487976, | |
| "rewards/alfworld_rollout_reward_func/std": 0.38171443343162537, | |
| "sampling/importance_sampling_ratio/max": 2.6272292137145996, | |
| "sampling/importance_sampling_ratio/mean": 0.7341758608818054, | |
| "sampling/importance_sampling_ratio/min": 0.03513141721487045, | |
| "sampling/sampling_logp_difference/max": 1.0201144218444824, | |
| "sampling/sampling_logp_difference/mean": 0.01651364006102085, | |
| "step": 80, | |
| "step_time": 256.61792850801066 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4864.0, | |
| "completions/max_terminated_length": 4864.0, | |
| "completions/mean_length": 4409.25, | |
| "completions/mean_terminated_length": 4409.25, | |
| "completions/min_length": 1956.0, | |
| "completions/min_terminated_length": 1956.0, | |
| "entropy": 0.4008057755418122, | |
| "epoch": 0.648, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.48246461153030396, | |
| "kl": 0.006945399109099526, | |
| "learning_rate": 7.604245525660372e-06, | |
| "loss": -0.4693, | |
| "num_tokens": 17554830.0, | |
| "reward": 0.05581250041723251, | |
| "reward_std": 0.0946279764175415, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.05581250041723251, | |
| "rewards/alfworld_rollout_reward_func/std": 0.3801369369029999, | |
| "sampling/importance_sampling_ratio/max": 2.2769978046417236, | |
| "sampling/importance_sampling_ratio/mean": 0.3646348714828491, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9721014499664307, | |
| "sampling/sampling_logp_difference/mean": 0.02023322880268097, | |
| "step": 81, | |
| "step_time": 234.1297659530137 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5459.0, | |
| "completions/max_terminated_length": 5459.0, | |
| "completions/mean_length": 2836.8125, | |
| "completions/mean_terminated_length": 2836.8125, | |
| "completions/min_length": 746.0, | |
| "completions/min_terminated_length": 746.0, | |
| "entropy": 0.3316339133307338, | |
| "epoch": 0.656, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.617982029914856, | |
| "kl": 0.01853690284769982, | |
| "learning_rate": 7.307312337476421e-06, | |
| "loss": 0.9743, | |
| "num_tokens": 17663112.0, | |
| "reward": 0.5675625205039978, | |
| "reward_std": 0.4371797442436218, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.5675625205039978, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5398529171943665, | |
| "sampling/importance_sampling_ratio/max": 2.917351007461548, | |
| "sampling/importance_sampling_ratio/mean": 0.7250344753265381, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 49.39155197143555, | |
| "sampling/sampling_logp_difference/mean": 0.026660067960619926, | |
| "step": 82, | |
| "step_time": 163.41237712898874 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8902.0, | |
| "completions/max_terminated_length": 8902.0, | |
| "completions/mean_length": 2005.8125, | |
| "completions/mean_terminated_length": 2005.8125, | |
| "completions/min_length": 872.0, | |
| "completions/min_terminated_length": 872.0, | |
| "entropy": 0.31633000262081623, | |
| "epoch": 0.664, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.2705594599246979, | |
| "kl": 0.0168228562688455, | |
| "learning_rate": 7.013879381113055e-06, | |
| "loss": 0.0649, | |
| "num_tokens": 17751938.0, | |
| "reward": 0.9136250019073486, | |
| "reward_std": 0.14719459414482117, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.9136250019073486, | |
| "rewards/alfworld_rollout_reward_func/std": 0.26975658535957336, | |
| "sampling/importance_sampling_ratio/max": 2.27459454536438, | |
| "sampling/importance_sampling_ratio/mean": 0.6756496429443359, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.0981965065002441, | |
| "sampling/sampling_logp_difference/mean": 0.022044427692890167, | |
| "step": 83, | |
| "step_time": 200.4008226580081 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6071.0, | |
| "completions/max_terminated_length": 6071.0, | |
| "completions/mean_length": 2893.40625, | |
| "completions/mean_terminated_length": 2893.40625, | |
| "completions/min_length": 822.0, | |
| "completions/min_terminated_length": 822.0, | |
| "entropy": 0.26535007590427995, | |
| "epoch": 0.672, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24309861660003662, | |
| "kl": 0.013757865759544075, | |
| "learning_rate": 6.7241444507450474e-06, | |
| "loss": -0.0118, | |
| "num_tokens": 17863375.0, | |
| "reward": 0.7944375276565552, | |
| "reward_std": 0.3526097536087036, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.7944375276565552, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4054880440235138, | |
| "sampling/importance_sampling_ratio/max": 2.7985546588897705, | |
| "sampling/importance_sampling_ratio/mean": 0.6256030797958374, | |
| "sampling/importance_sampling_ratio/min": 0.02893834561109543, | |
| "sampling/sampling_logp_difference/max": 1.398754596710205, | |
| "sampling/sampling_logp_difference/mean": 0.017740968614816666, | |
| "step": 84, | |
| "step_time": 165.64675680000073 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4573.0, | |
| "completions/max_terminated_length": 4573.0, | |
| "completions/mean_length": 3059.5625, | |
| "completions/mean_terminated_length": 3059.5625, | |
| "completions/min_length": 682.0, | |
| "completions/min_terminated_length": 682.0, | |
| "entropy": 0.4013794925995171, | |
| "epoch": 0.68, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.32808718085289, | |
| "kl": 0.01802923761715647, | |
| "learning_rate": 6.438302847821043e-06, | |
| "loss": 0.1523, | |
| "num_tokens": 17977121.0, | |
| "reward": 0.33393749594688416, | |
| "reward_std": 0.5907015204429626, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.33393749594688416, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5358816385269165, | |
| "sampling/importance_sampling_ratio/max": 1.924164891242981, | |
| "sampling/importance_sampling_ratio/mean": 0.3141007125377655, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 51.21648025512695, | |
| "sampling/sampling_logp_difference/mean": 0.03382871672511101, | |
| "step": 85, | |
| "step_time": 164.81202543501058 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10292.0, | |
| "completions/max_terminated_length": 10292.0, | |
| "completions/mean_length": 8584.96875, | |
| "completions/mean_terminated_length": 8584.96875, | |
| "completions/min_length": 2859.0, | |
| "completions/min_terminated_length": 2859.0, | |
| "entropy": 0.3392215413041413, | |
| "epoch": 0.688, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6542012095451355, | |
| "kl": 0.011928978288779035, | |
| "learning_rate": 6.1565472494168055e-06, | |
| "loss": 0.2016, | |
| "num_tokens": 18277600.0, | |
| "reward": 0.0807500034570694, | |
| "reward_std": 0.30580368638038635, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.0807500034570694, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4279302656650543, | |
| "sampling/importance_sampling_ratio/max": 2.9803245067596436, | |
| "sampling/importance_sampling_ratio/mean": 0.6060307025909424, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7128372192382812, | |
| "sampling/sampling_logp_difference/mean": 0.022582147270441055, | |
| "step": 86, | |
| "step_time": 378.8619575729863 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9014.0, | |
| "completions/max_terminated_length": 9014.0, | |
| "completions/mean_length": 5255.03125, | |
| "completions/mean_terminated_length": 5255.03125, | |
| "completions/min_length": 2081.0, | |
| "completions/min_terminated_length": 2081.0, | |
| "entropy": 0.27021760190837085, | |
| "epoch": 0.696, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5374975800514221, | |
| "kl": 0.009437592703761766, | |
| "learning_rate": 5.879067578357521e-06, | |
| "loss": 0.851, | |
| "num_tokens": 18470369.0, | |
| "reward": 0.5600000023841858, | |
| "reward_std": 0.4250272810459137, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.5600000023841858, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5282309055328369, | |
| "sampling/importance_sampling_ratio/max": 2.8345797061920166, | |
| "sampling/importance_sampling_ratio/mean": 0.8371989130973816, | |
| "sampling/importance_sampling_ratio/min": 0.00144236593041569, | |
| "sampling/sampling_logp_difference/max": 1.072211742401123, | |
| "sampling/sampling_logp_difference/mean": 0.017154190689325333, | |
| "step": 87, | |
| "step_time": 283.20546714899683 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6547.0, | |
| "completions/max_terminated_length": 6547.0, | |
| "completions/mean_length": 5855.5, | |
| "completions/mean_terminated_length": 5855.5, | |
| "completions/min_length": 3651.0, | |
| "completions/min_terminated_length": 3651.0, | |
| "entropy": 0.3743594288825989, | |
| "epoch": 0.704, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.3856624364852905, | |
| "kl": 0.0597444533923408, | |
| "learning_rate": 5.6060508751966186e-06, | |
| "loss": 0.8742, | |
| "num_tokens": 18676593.0, | |
| "reward": -0.04043750464916229, | |
| "reward_std": 0.1648387759923935, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.04043750464916229, | |
| "rewards/alfworld_rollout_reward_func/std": 0.31141039729118347, | |
| "sampling/importance_sampling_ratio/max": 2.986142635345459, | |
| "sampling/importance_sampling_ratio/mean": 0.5198343992233276, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.3955113887786865, | |
| "sampling/sampling_logp_difference/mean": 0.02004336006939411, | |
| "step": 88, | |
| "step_time": 254.65699046499503 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6107.0, | |
| "completions/max_terminated_length": 6107.0, | |
| "completions/mean_length": 2628.4375, | |
| "completions/mean_terminated_length": 2628.4375, | |
| "completions/min_length": 1046.0, | |
| "completions/min_terminated_length": 1046.0, | |
| "entropy": 0.2884806345682591, | |
| "epoch": 0.712, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4651772379875183, | |
| "kl": 0.012305404627113603, | |
| "learning_rate": 5.3376811721374765e-06, | |
| "loss": 0.1412, | |
| "num_tokens": 18777311.0, | |
| "reward": 0.6704374551773071, | |
| "reward_std": 0.28652966022491455, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.6704374551773071, | |
| "rewards/alfworld_rollout_reward_func/std": 0.486878365278244, | |
| "sampling/importance_sampling_ratio/max": 2.315708875656128, | |
| "sampling/importance_sampling_ratio/mean": 0.7435892820358276, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.3662152290344238, | |
| "sampling/sampling_logp_difference/mean": 0.02060602232813835, | |
| "step": 89, | |
| "step_time": 160.6579905329927 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5816.0, | |
| "completions/max_terminated_length": 5816.0, | |
| "completions/mean_length": 4033.40625, | |
| "completions/mean_terminated_length": 4033.40625, | |
| "completions/min_length": 1956.0, | |
| "completions/min_terminated_length": 1956.0, | |
| "entropy": 0.39709911681711674, | |
| "epoch": 0.72, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5232568383216858, | |
| "kl": 0.008380314080568496, | |
| "learning_rate": 5.074139368982922e-06, | |
| "loss": -0.0378, | |
| "num_tokens": 18921996.0, | |
| "reward": 0.36518752574920654, | |
| "reward_std": 0.5018154978752136, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.36518752574920654, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5176995992660522, | |
| "sampling/importance_sampling_ratio/max": 2.772024393081665, | |
| "sampling/importance_sampling_ratio/mean": 0.5188237428665161, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.2695866823196411, | |
| "sampling/sampling_logp_difference/mean": 0.021091943606734276, | |
| "step": 90, | |
| "step_time": 223.63505906601858 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8433.0, | |
| "completions/max_terminated_length": 8433.0, | |
| "completions/mean_length": 5452.28125, | |
| "completions/mean_terminated_length": 5452.28125, | |
| "completions/min_length": 1688.0, | |
| "completions/min_terminated_length": 1688.0, | |
| "entropy": 0.31468175584450364, | |
| "epoch": 0.728, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6314539909362793, | |
| "kl": 0.010496957955183461, | |
| "learning_rate": 4.81560311119624e-06, | |
| "loss": 0.7913, | |
| "num_tokens": 19119445.0, | |
| "reward": 0.515999972820282, | |
| "reward_std": 0.49320709705352783, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.515999972820282, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5465896725654602, | |
| "sampling/importance_sampling_ratio/max": 2.785933017730713, | |
| "sampling/importance_sampling_ratio/mean": 0.6670159697532654, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.1129628419876099, | |
| "sampling/sampling_logp_difference/mean": 0.019474081695079803, | |
| "step": 91, | |
| "step_time": 287.403252778 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4995.0, | |
| "completions/max_terminated_length": 4995.0, | |
| "completions/mean_length": 3509.4375, | |
| "completions/mean_terminated_length": 3509.4375, | |
| "completions/min_length": 1214.0, | |
| "completions/min_terminated_length": 1214.0, | |
| "entropy": 0.40855799289420247, | |
| "epoch": 0.736, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7601175308227539, | |
| "kl": 0.010548980098974425, | |
| "learning_rate": 4.562246670155769e-06, | |
| "loss": -0.6266, | |
| "num_tokens": 19245923.0, | |
| "reward": 0.2528125047683716, | |
| "reward_std": 0.4320943057537079, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.2528125047683716, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5050470232963562, | |
| "sampling/importance_sampling_ratio/max": 1.8565300703048706, | |
| "sampling/importance_sampling_ratio/mean": 0.5377044081687927, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7181758880615234, | |
| "sampling/sampling_logp_difference/mean": 0.022393332794308662, | |
| "step": 92, | |
| "step_time": 197.56333248000374 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7078.0, | |
| "completions/max_terminated_length": 7078.0, | |
| "completions/mean_length": 3108.125, | |
| "completions/mean_terminated_length": 3108.125, | |
| "completions/min_length": 727.0, | |
| "completions/min_terminated_length": 727.0, | |
| "entropy": 0.27243693731725216, | |
| "epoch": 0.744, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20756395161151886, | |
| "kl": 0.012253020591742825, | |
| "learning_rate": 4.314240825683938e-06, | |
| "loss": 0.1539, | |
| "num_tokens": 19366375.0, | |
| "reward": 0.6510000228881836, | |
| "reward_std": 0.4009626507759094, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.6510000228881836, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5017029047012329, | |
| "sampling/importance_sampling_ratio/max": 1.8515329360961914, | |
| "sampling/importance_sampling_ratio/mean": 0.6361359357833862, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 51.044925689697266, | |
| "sampling/sampling_logp_difference/mean": 0.023394249379634857, | |
| "step": 93, | |
| "step_time": 195.112117294997 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 5563.0, | |
| "completions/max_terminated_length": 5298.0, | |
| "completions/mean_length": 3876.15625, | |
| "completions/mean_terminated_length": 3821.741943359375, | |
| "completions/min_length": 1038.0, | |
| "completions/min_terminated_length": 1038.0, | |
| "entropy": 0.3827931974083185, | |
| "epoch": 0.752, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22425270080566406, | |
| "kl": 0.009305694955401123, | |
| "learning_rate": 4.071752750929776e-06, | |
| "loss": -0.2032, | |
| "num_tokens": 19505132.0, | |
| "reward": 0.09706249833106995, | |
| "reward_std": 0.36825689673423767, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.09706249833106995, | |
| "rewards/alfworld_rollout_reward_func/std": 0.45590633153915405, | |
| "sampling/importance_sampling_ratio/max": 1.0314832925796509, | |
| "sampling/importance_sampling_ratio/mean": 0.26068440079689026, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.0274133682250977, | |
| "sampling/sampling_logp_difference/mean": 0.021985212340950966, | |
| "step": 94, | |
| "step_time": 213.6653461980095 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10037.0, | |
| "completions/max_terminated_length": 10037.0, | |
| "completions/mean_length": 7284.71875, | |
| "completions/mean_terminated_length": 7284.71875, | |
| "completions/min_length": 1864.0, | |
| "completions/min_terminated_length": 1864.0, | |
| "entropy": 0.32221671054139733, | |
| "epoch": 0.76, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28681808710098267, | |
| "kl": 0.009068961640878115, | |
| "learning_rate": 3.834945899682642e-06, | |
| "loss": 0.2213, | |
| "num_tokens": 19762819.0, | |
| "reward": 0.211062490940094, | |
| "reward_std": 0.41743093729019165, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.211062490940094, | |
| "rewards/alfworld_rollout_reward_func/std": 0.4870281219482422, | |
| "sampling/importance_sampling_ratio/max": 2.7327306270599365, | |
| "sampling/importance_sampling_ratio/mean": 0.4521108865737915, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.1790721416473389, | |
| "sampling/sampling_logp_difference/mean": 0.020200295373797417, | |
| "step": 95, | |
| "step_time": 362.24372679200314 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6032.0, | |
| "completions/max_terminated_length": 6032.0, | |
| "completions/mean_length": 3912.6875, | |
| "completions/mean_terminated_length": 3912.6875, | |
| "completions/min_length": 1226.0, | |
| "completions/min_terminated_length": 1226.0, | |
| "entropy": 0.3016722968313843, | |
| "epoch": 0.768, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7907325625419617, | |
| "kl": 0.007123322902771179, | |
| "learning_rate": 3.6039798961929995e-06, | |
| "loss": 2.2501, | |
| "num_tokens": 19905657.0, | |
| "reward": 0.29868751764297485, | |
| "reward_std": 0.4651522636413574, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.29868751764297485, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5572153329849243, | |
| "sampling/importance_sampling_ratio/max": 2.9006409645080566, | |
| "sampling/importance_sampling_ratio/mean": 0.6974336504936218, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.1841108798980713, | |
| "sampling/sampling_logp_difference/mean": 0.01824130117893219, | |
| "step": 96, | |
| "step_time": 207.6763199779889 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 13391.0, | |
| "completions/max_terminated_length": 13391.0, | |
| "completions/mean_length": 9185.15625, | |
| "completions/mean_terminated_length": 9185.15625, | |
| "completions/min_length": 1412.0, | |
| "completions/min_terminated_length": 1412.0, | |
| "entropy": 0.3281913371756673, | |
| "epoch": 0.776, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5702968835830688, | |
| "kl": 0.01132094238710124, | |
| "learning_rate": 3.379010427574625e-06, | |
| "loss": 1.5224, | |
| "num_tokens": 20232670.0, | |
| "reward": 0.33243751525878906, | |
| "reward_std": 0.5196202993392944, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.33243751525878906, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5371140837669373, | |
| "sampling/importance_sampling_ratio/max": 2.960601568222046, | |
| "sampling/importance_sampling_ratio/mean": 0.5209211111068726, | |
| "sampling/importance_sampling_ratio/min": 1.0663071004331643e-30, | |
| "sampling/sampling_logp_difference/max": 53.7265510559082, | |
| "sampling/sampling_logp_difference/mean": 0.023852139711380005, | |
| "step": 97, | |
| "step_time": 476.51617424899814 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 9493.0, | |
| "completions/max_terminated_length": 9493.0, | |
| "completions/mean_length": 8688.34375, | |
| "completions/mean_terminated_length": 8688.34375, | |
| "completions/min_length": 7963.0, | |
| "completions/min_terminated_length": 7963.0, | |
| "entropy": 0.2913727913983166, | |
| "epoch": 0.784, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.47073104977607727, | |
| "kl": 0.016036324086599052, | |
| "learning_rate": 3.160189138860671e-06, | |
| "loss": 0.1501, | |
| "num_tokens": 20535273.0, | |
| "reward": -0.08656249940395355, | |
| "reward_std": 0.028624679893255234, | |
| "rewards/alfworld_rollout_reward_func/mean": -0.08656249940395355, | |
| "rewards/alfworld_rollout_reward_func/std": 0.03789709135890007, | |
| "sampling/importance_sampling_ratio/max": 1.8403443098068237, | |
| "sampling/importance_sampling_ratio/mean": 0.3093043565750122, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.646960735321045, | |
| "sampling/sampling_logp_difference/mean": 0.020046139135956764, | |
| "step": 98, | |
| "step_time": 365.8523392380048 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5668.0, | |
| "completions/max_terminated_length": 5668.0, | |
| "completions/mean_length": 4537.65625, | |
| "completions/mean_terminated_length": 4537.65625, | |
| "completions/min_length": 1378.0, | |
| "completions/min_terminated_length": 1378.0, | |
| "entropy": 0.3762055607512593, | |
| "epoch": 0.792, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27318060398101807, | |
| "kl": 0.00881329235562589, | |
| "learning_rate": 2.947663530784388e-06, | |
| "loss": 0.4014, | |
| "num_tokens": 20697662.0, | |
| "reward": 0.2487500011920929, | |
| "reward_std": 0.48654043674468994, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.2487500011920929, | |
| "rewards/alfworld_rollout_reward_func/std": 0.5086416006088257, | |
| "sampling/importance_sampling_ratio/max": 1.298455834388733, | |
| "sampling/importance_sampling_ratio/mean": 0.33954960107803345, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 57.009605407714844, | |
| "sampling/sampling_logp_difference/mean": 0.023928016424179077, | |
| "step": 99, | |
| "step_time": 227.85088505800013 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7124.0, | |
| "completions/max_terminated_length": 7124.0, | |
| "completions/mean_length": 3760.96875, | |
| "completions/mean_terminated_length": 3760.96875, | |
| "completions/min_length": 1125.0, | |
| "completions/min_terminated_length": 1125.0, | |
| "entropy": 0.3388551725074649, | |
| "epoch": 0.8, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3301438093185425, | |
| "kl": 0.014853068540105596, | |
| "learning_rate": 2.7415768603533996e-06, | |
| "loss": 0.0864, | |
| "num_tokens": 20837821.0, | |
| "reward": 0.6660000085830688, | |
| "reward_std": 0.4354756474494934, | |
| "rewards/alfworld_rollout_reward_func/mean": 0.6660000085830688, | |
| "rewards/alfworld_rollout_reward_func/std": 0.486807644367218, | |
| "sampling/importance_sampling_ratio/max": 2.7073638439178467, | |
| "sampling/importance_sampling_ratio/mean": 0.5111602544784546, | |
| "sampling/importance_sampling_ratio/min": 0.004309141077101231, | |
| "sampling/sampling_logp_difference/max": 1.547012209892273, | |
| "sampling/sampling_logp_difference/mean": 0.021454855799674988, | |
| "step": 100, | |
| "step_time": 210.89971194700775 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 125, | |
| "num_input_tokens_seen": 20837821, | |
| "num_train_epochs": 1, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |