Text Generation
PEFT
Safetensors
Transformers
llama
grpo
lora
trl
conversational
text-generation-inference
Instructions to use Gege24/Liars_dice_final_memek with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Gege24/Liars_dice_final_memek with PEFT:
Base model is not found.
- Transformers
How to use Gege24/Liars_dice_final_memek with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Gege24/Liars_dice_final_memek")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Gege24/Liars_dice_final_memek")
model = AutoModelForCausalLM.from_pretrained("Gege24/Liars_dice_final_memek")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use Gege24/Liars_dice_final_memek with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Gege24/Liars_dice_final_memek"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/Liars_dice_final_memek",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'

Use Docker
docker model run hf.co/Gege24/Liars_dice_final_memek
- SGLang
How to use Gege24/Liars_dice_final_memek with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Gege24/Liars_dice_final_memek" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/Liars_dice_final_memek",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'

Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Gege24/Liars_dice_final_memek" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/Liars_dice_final_memek",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'

- Docker Model Runner
How to use Gege24/Liars_dice_final_memek with Docker Model Runner:
docker model run hf.co/Gege24/Liars_dice_final_memek
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.00225, | |
| "eval_steps": 500, | |
| "global_step": 225, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 839.0, | |
| "completions/max_terminated_length": 839.0, | |
| "completions/mean_length": 265.75, | |
| "completions/mean_terminated_length": 265.75, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.740012645721436, | |
| "epoch": 1e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.019221410155296326, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.0006, | |
| "num_tokens": 45751.0, | |
| "reward": 0.816877007484436, | |
| "reward_std": 1.4014036655426025, | |
| "rewards/rollout_reward_func/mean": 0.816877007484436, | |
| "rewards/rollout_reward_func/std": 1.6075319051742554, | |
| "sampling/importance_sampling_ratio/max": 0.03914691507816315, | |
| "sampling/importance_sampling_ratio/mean": 0.013615390285849571, | |
| "sampling/importance_sampling_ratio/min": 1.1552421904970122e-15, | |
| "sampling/sampling_logp_difference/max": 3.914313554763794, | |
| "sampling/sampling_logp_difference/mean": 1.6371219158172607, | |
| "step": 1, | |
| "step_time": 9.72409536699979 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.740012645721436, | |
| "epoch": 2e-05, | |
| "grad_norm": 0.01974678784608841, | |
| "kl": 0.0, | |
| "learning_rate": 2.8571428571428575e-07, | |
| "loss": -0.0006, | |
| "step": 2, | |
| "step_time": 4.797613267999623 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 760.0, | |
| "completions/max_terminated_length": 760.0, | |
| "completions/mean_length": 412.5625, | |
| "completions/mean_terminated_length": 412.5625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.820557475090027, | |
| "epoch": 3e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011553559452295303, | |
| "kl": 0.0009907482417474966, | |
| "learning_rate": 5.714285714285715e-07, | |
| "loss": 0.0001, | |
| "num_tokens": 99312.0, | |
| "reward": 2.8992574214935303, | |
| "reward_std": 1.8266513347625732, | |
| "rewards/rollout_reward_func/mean": 2.8992574214935303, | |
| "rewards/rollout_reward_func/std": 1.9147884845733643, | |
| "sampling/importance_sampling_ratio/max": 0.0381130687892437, | |
| "sampling/importance_sampling_ratio/mean": 0.009180868044495583, | |
| "sampling/importance_sampling_ratio/min": 9.134832647250679e-12, | |
| "sampling/sampling_logp_difference/max": 3.4724807739257812, | |
| "sampling/sampling_logp_difference/mean": 1.698885440826416, | |
| "step": 3, | |
| "step_time": 8.947379413000363 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.818018198013306, | |
| "epoch": 4e-05, | |
| "grad_norm": 0.011284240521490574, | |
| "kl": 0.0009902061865432188, | |
| "learning_rate": 8.571428571428572e-07, | |
| "loss": 0.0001, | |
| "step": 4, | |
| "step_time": 5.229739129000336 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 571.0, | |
| "completions/max_terminated_length": 571.0, | |
| "completions/mean_length": 165.09375, | |
| "completions/mean_terminated_length": 165.09375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.794174313545227, | |
| "epoch": 5e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.028869854286313057, | |
| "kl": 0.0008135313764796592, | |
| "learning_rate": 1.142857142857143e-06, | |
| "loss": -0.0004, | |
| "num_tokens": 141029.0, | |
| "reward": 1.8564525842666626, | |
| "reward_std": 2.077150344848633, | |
| "rewards/rollout_reward_func/mean": 1.8564525842666626, | |
| "rewards/rollout_reward_func/std": 2.0850281715393066, | |
| "sampling/importance_sampling_ratio/max": 0.05348784476518631, | |
| "sampling/importance_sampling_ratio/mean": 0.017640406265854836, | |
| "sampling/importance_sampling_ratio/min": 0.00042824808042496443, | |
| "sampling/sampling_logp_difference/max": 2.332674980163574, | |
| "sampling/sampling_logp_difference/mean": 1.7570207118988037, | |
| "step": 5, | |
| "step_time": 8.192974794999373 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.788055181503296, | |
| "epoch": 6e-05, | |
| "grad_norm": 0.029661983251571655, | |
| "kl": 0.0007373044900305104, | |
| "learning_rate": 1.4285714285714286e-06, | |
| "loss": -0.0005, | |
| "step": 6, | |
| "step_time": 4.266827427000862 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 848.0, | |
| "completions/max_terminated_length": 848.0, | |
| "completions/mean_length": 541.6875, | |
| "completions/mean_terminated_length": 541.6875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.803173184394836, | |
| "epoch": 7e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004336625337600708, | |
| "kl": 0.0010216275259153917, | |
| "learning_rate": 1.7142857142857145e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 199637.0, | |
| "reward": 1.30655038356781, | |
| "reward_std": 0.9892024993896484, | |
| "rewards/rollout_reward_func/mean": 1.30655038356781, | |
| "rewards/rollout_reward_func/std": 1.138155221939087, | |
| "sampling/importance_sampling_ratio/max": 0.028351690620183945, | |
| "sampling/importance_sampling_ratio/mean": 0.005052408203482628, | |
| "sampling/importance_sampling_ratio/min": 2.671416343005633e-15, | |
| "sampling/sampling_logp_difference/max": 4.701449394226074, | |
| "sampling/sampling_logp_difference/mean": 1.7298243045806885, | |
| "step": 7, | |
| "step_time": 9.286785637000321 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.789715051651001, | |
| "epoch": 8e-05, | |
| "grad_norm": 0.004228756297379732, | |
| "kl": 0.000888259346538689, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0002, | |
| "step": 8, | |
| "step_time": 4.969429294999827 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0026041667442768812, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 686.0, | |
| "completions/max_terminated_length": 686.0, | |
| "completions/mean_length": 286.75, | |
| "completions/mean_terminated_length": 295.4838562011719, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.3694349527359, | |
| "epoch": 9e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.010730103589594364, | |
| "kl": 0.0009608958062017336, | |
| "learning_rate": 2.285714285714286e-06, | |
| "loss": -0.0, | |
| "num_tokens": 246380.0, | |
| "reward": 2.1979100704193115, | |
| "reward_std": 1.8867942094802856, | |
| "rewards/rollout_reward_func/mean": 2.1979100704193115, | |
| "rewards/rollout_reward_func/std": 2.1932425498962402, | |
| "sampling/importance_sampling_ratio/max": 0.03839043155312538, | |
| "sampling/importance_sampling_ratio/mean": 0.012242316268384457, | |
| "sampling/importance_sampling_ratio/min": 3.078865162819966e-08, | |
| "sampling/sampling_logp_difference/max": 3.9608242511749268, | |
| "sampling/sampling_logp_difference/mean": 1.4708642959594727, | |
| "step": 9, | |
| "step_time": 8.68651038400003 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.358309984207153, | |
| "epoch": 0.0001, | |
| "grad_norm": 0.0108121233060956, | |
| "kl": 0.000685311508277664, | |
| "learning_rate": 2.571428571428571e-06, | |
| "loss": -0.0, | |
| "step": 10, | |
| "step_time": 5.445634689999224 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0028409091755747795, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 803.0, | |
| "completions/max_terminated_length": 803.0, | |
| "completions/mean_length": 553.90625, | |
| "completions/mean_terminated_length": 553.90625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.565176486968994, | |
| "epoch": 0.00011, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0031674716155976057, | |
| "kl": 0.0009231339645339176, | |
| "learning_rate": 2.8571428571428573e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 306137.0, | |
| "reward": 1.9738842248916626, | |
| "reward_std": 1.5114688873291016, | |
| "rewards/rollout_reward_func/mean": 1.9738842248916626, | |
| "rewards/rollout_reward_func/std": 1.8342463970184326, | |
| "sampling/importance_sampling_ratio/max": 0.02065931260585785, | |
| "sampling/importance_sampling_ratio/mean": 0.0028495141305029392, | |
| "sampling/importance_sampling_ratio/min": 5.273884899763661e-19, | |
| "sampling/sampling_logp_difference/max": 3.751443862915039, | |
| "sampling/sampling_logp_difference/mean": 1.6439871788024902, | |
| "step": 11, | |
| "step_time": 9.134554064999975 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0028409091755747795, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "entropy": 8.561085760593414, | |
| "epoch": 0.00012, | |
| "grad_norm": 0.0026493030600249767, | |
| "kl": 0.0009400276176165789, | |
| "learning_rate": 3.142857142857143e-06, | |
| "loss": 0.0001, | |
| "step": 12, | |
| "step_time": 4.8191932119998455 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0052083334885537624, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 571.0, | |
| "completions/max_terminated_length": 571.0, | |
| "completions/mean_length": 162.4375, | |
| "completions/mean_terminated_length": 162.4375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.512920141220093, | |
| "epoch": 0.00013, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013727321289479733, | |
| "kl": 0.0018250496359542012, | |
| "learning_rate": 3.428571428571429e-06, | |
| "loss": -0.0004, | |
| "num_tokens": 349131.0, | |
| "reward": 1.47185218334198, | |
| "reward_std": 1.5472846031188965, | |
| "rewards/rollout_reward_func/mean": 1.47185218334198, | |
| "rewards/rollout_reward_func/std": 2.0390946865081787, | |
| "sampling/importance_sampling_ratio/max": 0.049390941858291626, | |
| "sampling/importance_sampling_ratio/mean": 0.020272064954042435, | |
| "sampling/importance_sampling_ratio/min": 2.633779558891547e-06, | |
| "sampling/sampling_logp_difference/max": 2.360596179962158, | |
| "sampling/sampling_logp_difference/mean": 1.5058674812316895, | |
| "step": 13, | |
| "step_time": 7.890245483999934 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "entropy": 8.52327024936676, | |
| "epoch": 0.00014, | |
| "grad_norm": 0.01371886394917965, | |
| "kl": 0.002251528945635073, | |
| "learning_rate": 3.7142857142857146e-06, | |
| "loss": -0.0004, | |
| "step": 14, | |
| "step_time": 4.277807705000669 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 866.0, | |
| "completions/max_terminated_length": 866.0, | |
| "completions/mean_length": 493.03125, | |
| "completions/mean_terminated_length": 508.4193420410156, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.769269943237305, | |
| "epoch": 0.00015, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004188260994851589, | |
| "kl": 0.0018109382945112884, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 405372.0, | |
| "reward": 2.176278591156006, | |
| "reward_std": 1.8223553895950317, | |
| "rewards/rollout_reward_func/mean": 2.176278591156006, | |
| "rewards/rollout_reward_func/std": 1.8436557054519653, | |
| "sampling/importance_sampling_ratio/max": 0.021065089851617813, | |
| "sampling/importance_sampling_ratio/mean": 0.0036145278718322515, | |
| "sampling/importance_sampling_ratio/min": 4.2438622060991804e-13, | |
| "sampling/sampling_logp_difference/max": 3.6341652870178223, | |
| "sampling/sampling_logp_difference/mean": 1.674858808517456, | |
| "step": 15, | |
| "step_time": 9.12190689900035 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.767740607261658, | |
| "epoch": 0.00016, | |
| "grad_norm": 0.004105101805180311, | |
| "kl": 0.002771631450741552, | |
| "learning_rate": 4.2857142857142855e-06, | |
| "loss": -0.0001, | |
| "step": 16, | |
| "step_time": 5.4121323870003835 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 733.0, | |
| "completions/max_terminated_length": 733.0, | |
| "completions/mean_length": 420.0, | |
| "completions/mean_terminated_length": 421.7241516113281, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.60806268453598, | |
| "epoch": 0.00017, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009108365513384342, | |
| "kl": 0.004436066417838447, | |
| "learning_rate": 4.571428571428572e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 458204.0, | |
| "reward": 2.0131897926330566, | |
| "reward_std": 1.7921838760375977, | |
| "rewards/rollout_reward_func/mean": 2.0131897926330566, | |
| "rewards/rollout_reward_func/std": 1.917612910270691, | |
| "sampling/importance_sampling_ratio/max": 0.03794016316533089, | |
| "sampling/importance_sampling_ratio/mean": 0.00851379707455635, | |
| "sampling/importance_sampling_ratio/min": 5.970022844210435e-30, | |
| "sampling/sampling_logp_difference/max": 3.762781858444214, | |
| "sampling/sampling_logp_difference/mean": 1.7653887271881104, | |
| "step": 17, | |
| "step_time": 8.833719273000042 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0017361111240461469, | |
| "entropy": 8.605763673782349, | |
| "epoch": 0.00018, | |
| "grad_norm": 0.009003642946481705, | |
| "kl": 0.004826090880669653, | |
| "learning_rate": 4.857142857142858e-06, | |
| "loss": 0.0001, | |
| "step": 18, | |
| "step_time": 4.669556054000623 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 778.0, | |
| "completions/max_terminated_length": 778.0, | |
| "completions/mean_length": 356.1875, | |
| "completions/mean_terminated_length": 356.1875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.589665651321411, | |
| "epoch": 0.00019, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.012068537063896656, | |
| "kl": 0.007034957525320351, | |
| "learning_rate": 5.142857142857142e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 510714.0, | |
| "reward": 1.6357378959655762, | |
| "reward_std": 1.9768089056015015, | |
| "rewards/rollout_reward_func/mean": 1.6357378959655762, | |
| "rewards/rollout_reward_func/std": 1.9230132102966309, | |
| "sampling/importance_sampling_ratio/max": 0.0377332866191864, | |
| "sampling/importance_sampling_ratio/mean": 0.00977294985204935, | |
| "sampling/importance_sampling_ratio/min": 1.5810989850706392e-08, | |
| "sampling/sampling_logp_difference/max": 2.8323609828948975, | |
| "sampling/sampling_logp_difference/mean": 1.57611083984375, | |
| "step": 19, | |
| "step_time": 8.702261807000923 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.010416666977107525, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "entropy": 8.588200807571411, | |
| "epoch": 0.0002, | |
| "grad_norm": 0.01012762077152729, | |
| "kl": 0.008162530430126935, | |
| "learning_rate": 5.428571428571429e-06, | |
| "loss": 0.0003, | |
| "step": 20, | |
| "step_time": 4.793898748000174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 725.0, | |
| "completions/max_terminated_length": 676.0, | |
| "completions/mean_length": 169.8125, | |
| "completions/mean_terminated_length": 156.43333435058594, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.763131499290466, | |
| "epoch": 0.00021, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014919566921889782, | |
| "kl": 0.021441200777189806, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": -0.0005, | |
| "num_tokens": 554259.0, | |
| "reward": 2.2263033390045166, | |
| "reward_std": 1.681884765625, | |
| "rewards/rollout_reward_func/mean": 2.2263033390045166, | |
| "rewards/rollout_reward_func/std": 1.8072566986083984, | |
| "sampling/importance_sampling_ratio/max": 0.08593336492776871, | |
| "sampling/importance_sampling_ratio/mean": 0.022409576922655106, | |
| "sampling/importance_sampling_ratio/min": 2.349876534956627e-22, | |
| "sampling/sampling_logp_difference/max": 4.372560501098633, | |
| "sampling/sampling_logp_difference/mean": 1.931687831878662, | |
| "step": 21, | |
| "step_time": 8.055510063999009 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.737543225288391, | |
| "epoch": 0.00022, | |
| "grad_norm": 0.014876801520586014, | |
| "kl": 0.029705224180361256, | |
| "learning_rate": 6e-06, | |
| "loss": -0.0005, | |
| "step": 22, | |
| "step_time": 5.419155312999919 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 758.0, | |
| "completions/max_terminated_length": 758.0, | |
| "completions/mean_length": 182.1875, | |
| "completions/mean_terminated_length": 171.50001525878906, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.859729766845703, | |
| "epoch": 0.00023, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029221149161458015, | |
| "kl": 0.0323223132872954, | |
| "learning_rate": 6.285714285714286e-06, | |
| "loss": -0.0011, | |
| "num_tokens": 596571.0, | |
| "reward": 1.5583102703094482, | |
| "reward_std": 1.6529381275177002, | |
| "rewards/rollout_reward_func/mean": 1.5583102703094482, | |
| "rewards/rollout_reward_func/std": 1.7341761589050293, | |
| "sampling/importance_sampling_ratio/max": 0.06399935483932495, | |
| "sampling/importance_sampling_ratio/mean": 0.025177521631121635, | |
| "sampling/importance_sampling_ratio/min": 1.0080106696608216e-18, | |
| "sampling/sampling_logp_difference/max": 3.969541072845459, | |
| "sampling/sampling_logp_difference/mean": 1.8030247688293457, | |
| "step": 23, | |
| "step_time": 8.169906658999935 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "entropy": 8.80251955986023, | |
| "epoch": 0.00024, | |
| "grad_norm": 0.029200905933976173, | |
| "kl": 0.04814133094623685, | |
| "learning_rate": 6.571428571428572e-06, | |
| "loss": -0.0012, | |
| "step": 24, | |
| "step_time": 4.54475868199961 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 517.65625, | |
| "completions/mean_terminated_length": 530.7333374023438, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.177129030227661, | |
| "epoch": 0.00025, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.008773105219006538, | |
| "kl": 0.03207368147559464, | |
| "learning_rate": 6.857142857142858e-06, | |
| "loss": -0.0007, | |
| "num_tokens": 653422.0, | |
| "reward": 2.0930328369140625, | |
| "reward_std": 1.470797061920166, | |
| "rewards/rollout_reward_func/mean": 2.0930328369140625, | |
| "rewards/rollout_reward_func/std": 1.5851061344146729, | |
| "sampling/importance_sampling_ratio/max": 0.08151775598526001, | |
| "sampling/importance_sampling_ratio/mean": 0.01129196584224701, | |
| "sampling/importance_sampling_ratio/min": 2.138438081125682e-13, | |
| "sampling/sampling_logp_difference/max": 3.773285388946533, | |
| "sampling/sampling_logp_difference/mean": 1.4078741073608398, | |
| "step": 25, | |
| "step_time": 9.088412857000094 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.043181818444281816, | |
| "clip_ratio/high_mean": 0.021590909222140908, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021590909222140908, | |
| "entropy": 8.089222967624664, | |
| "epoch": 0.00026, | |
| "grad_norm": 0.008307461626827717, | |
| "kl": 0.04364914959296584, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": -0.0007, | |
| "step": 26, | |
| "step_time": 4.877906865999648 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 821.0, | |
| "completions/max_terminated_length": 821.0, | |
| "completions/mean_length": 309.9375, | |
| "completions/mean_terminated_length": 309.9375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.212523341178894, | |
| "epoch": 0.00027, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014188895933330059, | |
| "kl": 0.0853999936953187, | |
| "learning_rate": 7.428571428571429e-06, | |
| "loss": -0.001, | |
| "num_tokens": 702104.0, | |
| "reward": 1.7692888975143433, | |
| "reward_std": 1.6422841548919678, | |
| "rewards/rollout_reward_func/mean": 1.7692888975143433, | |
| "rewards/rollout_reward_func/std": 1.989976406097412, | |
| "sampling/importance_sampling_ratio/max": 0.11609657108783722, | |
| "sampling/importance_sampling_ratio/mean": 0.027356663718819618, | |
| "sampling/importance_sampling_ratio/min": 9.024407518154476e-06, | |
| "sampling/sampling_logp_difference/max": 2.4522972106933594, | |
| "sampling/sampling_logp_difference/mean": 1.4562647342681885, | |
| "step": 27, | |
| "step_time": 9.140366876000371 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.12500000186264515, | |
| "clip_ratio/high_mean": 0.06250000093132257, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06250000093132257, | |
| "entropy": 8.088905096054077, | |
| "epoch": 0.00028, | |
| "grad_norm": 0.014231563545763493, | |
| "kl": 0.10766742378473282, | |
| "learning_rate": 7.714285714285716e-06, | |
| "loss": -0.0011, | |
| "step": 28, | |
| "step_time": 5.39836863499977 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 803.0, | |
| "completions/max_terminated_length": 803.0, | |
| "completions/mean_length": 254.8125, | |
| "completions/mean_terminated_length": 262.51611328125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.048985123634338, | |
| "epoch": 0.00029, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015394963324069977, | |
| "kl": 0.1432503336109221, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": -0.001, | |
| "num_tokens": 749391.0, | |
| "reward": 1.818474292755127, | |
| "reward_std": 1.1649471521377563, | |
| "rewards/rollout_reward_func/mean": 1.818474292755127, | |
| "rewards/rollout_reward_func/std": 1.8718231916427612, | |
| "sampling/importance_sampling_ratio/max": 0.130377858877182, | |
| "sampling/importance_sampling_ratio/mean": 0.036613546311855316, | |
| "sampling/importance_sampling_ratio/min": 1.2765659362923287e-10, | |
| "sampling/sampling_logp_difference/max": 3.609269380569458, | |
| "sampling/sampling_logp_difference/mean": 1.4590439796447754, | |
| "step": 29, | |
| "step_time": 8.3333206719999 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0416666679084301, | |
| "clip_ratio/high_mean": 0.02083333395421505, | |
| "clip_ratio/low_mean": 0.010416666977107525, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.031250000931322575, | |
| "entropy": 7.883777499198914, | |
| "epoch": 0.0003, | |
| "grad_norm": 0.01582338474690914, | |
| "kl": 0.17406905256211758, | |
| "learning_rate": 8.285714285714287e-06, | |
| "loss": -0.0011, | |
| "step": 30, | |
| "step_time": 4.778778166000393 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 541.90625, | |
| "completions/mean_terminated_length": 558.8709716796875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.091109991073608, | |
| "epoch": 0.00031, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.021641016006469727, | |
| "kl": 0.12537556886672974, | |
| "learning_rate": 8.571428571428571e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 807902.0, | |
| "reward": 1.436830997467041, | |
| "reward_std": 1.0506994724273682, | |
| "rewards/rollout_reward_func/mean": 1.436830997467041, | |
| "rewards/rollout_reward_func/std": 1.2816261053085327, | |
| "sampling/importance_sampling_ratio/max": 0.07519304007291794, | |
| "sampling/importance_sampling_ratio/mean": 0.013723745942115784, | |
| "sampling/importance_sampling_ratio/min": 1.7366062713998758e-16, | |
| "sampling/sampling_logp_difference/max": 5.218207359313965, | |
| "sampling/sampling_logp_difference/mean": 1.5592684745788574, | |
| "step": 31, | |
| "step_time": 9.165752515999884 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0028409091755747795, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "entropy": 8.013701796531677, | |
| "epoch": 0.00032, | |
| "grad_norm": 0.021766290068626404, | |
| "kl": 0.1357055138796568, | |
| "learning_rate": 8.857142857142858e-06, | |
| "loss": 0.0003, | |
| "step": 32, | |
| "step_time": 4.949108714999966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 663.0, | |
| "completions/max_terminated_length": 662.0, | |
| "completions/mean_length": 355.46875, | |
| "completions/mean_terminated_length": 337.3000183105469, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.517293989658356, | |
| "epoch": 0.00033, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.017694007605314255, | |
| "kl": 0.20308297593146563, | |
| "learning_rate": 9.142857142857144e-06, | |
| "loss": -0.0019, | |
| "num_tokens": 859075.0, | |
| "reward": 2.665842056274414, | |
| "reward_std": 2.0510294437408447, | |
| "rewards/rollout_reward_func/mean": 2.665842056274414, | |
| "rewards/rollout_reward_func/std": 2.058197498321533, | |
| "sampling/importance_sampling_ratio/max": 0.1805843561887741, | |
| "sampling/importance_sampling_ratio/mean": 0.033763326704502106, | |
| "sampling/importance_sampling_ratio/min": 9.382639013877456e-15, | |
| "sampling/sampling_logp_difference/max": 4.034926891326904, | |
| "sampling/sampling_logp_difference/mean": 1.42521071434021, | |
| "step": 33, | |
| "step_time": 9.64415260199985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0028409091755747795, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "entropy": 7.461727142333984, | |
| "epoch": 0.00034, | |
| "grad_norm": 0.01653335802257061, | |
| "kl": 0.21375709865242243, | |
| "learning_rate": 9.42857142857143e-06, | |
| "loss": -0.002, | |
| "step": 34, | |
| "step_time": 4.6672892820001834 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 832.0, | |
| "completions/max_terminated_length": 832.0, | |
| "completions/mean_length": 287.5, | |
| "completions/mean_terminated_length": 272.1612854003906, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.252508640289307, | |
| "epoch": 0.00035, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04618426784873009, | |
| "kl": 0.294549023732543, | |
| "learning_rate": 9.714285714285715e-06, | |
| "loss": -0.001, | |
| "num_tokens": 908468.0, | |
| "reward": 1.9603195190429688, | |
| "reward_std": 1.7872586250305176, | |
| "rewards/rollout_reward_func/mean": 1.9603195190429688, | |
| "rewards/rollout_reward_func/std": 1.841855764389038, | |
| "sampling/importance_sampling_ratio/max": 0.19450248777866364, | |
| "sampling/importance_sampling_ratio/mean": 0.0430486798286438, | |
| "sampling/importance_sampling_ratio/min": 1.1371217567557323e-07, | |
| "sampling/sampling_logp_difference/max": 3.5069642066955566, | |
| "sampling/sampling_logp_difference/mean": 1.187075138092041, | |
| "step": 35, | |
| "step_time": 8.733247314000437 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02083333395421505, | |
| "clip_ratio/high_mean": 0.010416666977107525, | |
| "clip_ratio/low_mean": 0.015625, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.026041666977107525, | |
| "entropy": 7.227072596549988, | |
| "epoch": 0.00036, | |
| "grad_norm": 0.0316547267138958, | |
| "kl": 0.26715745590627193, | |
| "learning_rate": 1e-05, | |
| "loss": -0.0011, | |
| "step": 36, | |
| "step_time": 4.828337421000015 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013888888992369175, | |
| "clip_ratio/high_mean": 0.0069444444961845875, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.014756944496184587, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 778.0, | |
| "completions/max_terminated_length": 778.0, | |
| "completions/mean_length": 390.0, | |
| "completions/mean_terminated_length": 402.06451416015625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.85473507642746, | |
| "epoch": 0.00037, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01456042192876339, | |
| "kl": 0.20423047989606857, | |
| "learning_rate": 9.999999999884322e-06, | |
| "loss": -0.0029, | |
| "num_tokens": 961096.0, | |
| "reward": 3.3347973823547363, | |
| "reward_std": 1.635354995727539, | |
| "rewards/rollout_reward_func/mean": 3.3347973823547363, | |
| "rewards/rollout_reward_func/std": 1.591873288154602, | |
| "sampling/importance_sampling_ratio/max": 0.2037617266178131, | |
| "sampling/importance_sampling_ratio/mean": 0.04216703772544861, | |
| "sampling/importance_sampling_ratio/min": 3.60183348667997e-18, | |
| "sampling/sampling_logp_difference/max": 4.788333415985107, | |
| "sampling/sampling_logp_difference/mean": 1.3821991682052612, | |
| "step": 37, | |
| "step_time": 8.829268903999946 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013888888992369175, | |
| "clip_ratio/high_mean": 0.0069444444961845875, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.014756944496184587, | |
| "entropy": 6.8267329931259155, | |
| "epoch": 0.00038, | |
| "grad_norm": 0.012915832921862602, | |
| "kl": 0.2027184907346964, | |
| "learning_rate": 9.999999999537282e-06, | |
| "loss": -0.003, | |
| "step": 38, | |
| "step_time": 5.279609900000196 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 767.0, | |
| "completions/max_terminated_length": 767.0, | |
| "completions/mean_length": 292.125, | |
| "completions/mean_terminated_length": 291.9666748046875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.531160056591034, | |
| "epoch": 0.00039, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015933845192193985, | |
| "kl": 0.2402525246143341, | |
| "learning_rate": 9.999999998958884e-06, | |
| "loss": -0.0013, | |
| "num_tokens": 1010524.0, | |
| "reward": 1.7904164791107178, | |
| "reward_std": 1.6788225173950195, | |
| "rewards/rollout_reward_func/mean": 1.7904164791107178, | |
| "rewards/rollout_reward_func/std": 1.8581712245941162, | |
| "sampling/importance_sampling_ratio/max": 0.042524565011262894, | |
| "sampling/importance_sampling_ratio/mean": 0.021317776292562485, | |
| "sampling/importance_sampling_ratio/min": 5.162857177539051e-24, | |
| "sampling/sampling_logp_difference/max": 12.552388191223145, | |
| "sampling/sampling_logp_difference/mean": 1.622736930847168, | |
| "step": 39, | |
| "step_time": 9.170014825000635 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.470682978630066, | |
| "epoch": 0.0004, | |
| "grad_norm": 0.014897222630679607, | |
| "kl": 0.2379161100834608, | |
| "learning_rate": 9.999999998149125e-06, | |
| "loss": -0.0013, | |
| "step": 40, | |
| "step_time": 4.626162753000699 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 785.0, | |
| "completions/max_terminated_length": 785.0, | |
| "completions/mean_length": 588.96875, | |
| "completions/mean_terminated_length": 594.258056640625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.788816154003143, | |
| "epoch": 0.00041, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.020633654668927193, | |
| "kl": 0.17094908049330115, | |
| "learning_rate": 9.99999999710801e-06, | |
| "loss": -0.0025, | |
| "num_tokens": 1070581.0, | |
| "reward": 2.306253433227539, | |
| "reward_std": 1.3609917163848877, | |
| "rewards/rollout_reward_func/mean": 2.306253433227539, | |
| "rewards/rollout_reward_func/std": 1.8414863348007202, | |
| "sampling/importance_sampling_ratio/max": 0.05647118017077446, | |
| "sampling/importance_sampling_ratio/mean": 0.019686147570610046, | |
| "sampling/importance_sampling_ratio/min": 8.113022520378068e-17, | |
| "sampling/sampling_logp_difference/max": 5.106669902801514, | |
| "sampling/sampling_logp_difference/mean": 1.3194385766983032, | |
| "step": 41, | |
| "step_time": 9.059421735999877 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0028409091755747795, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "entropy": 6.697398841381073, | |
| "epoch": 0.00042, | |
| "grad_norm": 0.012766940519213676, | |
| "kl": 0.16542547149583697, | |
| "learning_rate": 9.999999995835533e-06, | |
| "loss": -0.0026, | |
| "step": 42, | |
| "step_time": 4.873758401001396 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 731.0, | |
| "completions/max_terminated_length": 731.0, | |
| "completions/mean_length": 368.4375, | |
| "completions/mean_terminated_length": 368.4375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.104367315769196, | |
| "epoch": 0.00043, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.014938808046281338, | |
| "kl": 0.23283970914781094, | |
| "learning_rate": 9.999999994331697e-06, | |
| "loss": -0.0029, | |
| "num_tokens": 1121352.0, | |
| "reward": 2.358142614364624, | |
| "reward_std": 1.2651467323303223, | |
| "rewards/rollout_reward_func/mean": 2.358142614364624, | |
| "rewards/rollout_reward_func/std": 1.9181171655654907, | |
| "sampling/importance_sampling_ratio/max": 0.26942965388298035, | |
| "sampling/importance_sampling_ratio/mean": 0.06439891457557678, | |
| "sampling/importance_sampling_ratio/min": 0.00032740956521593034, | |
| "sampling/sampling_logp_difference/max": 2.4414572715759277, | |
| "sampling/sampling_logp_difference/mean": 1.2698404788970947, | |
| "step": 43, | |
| "step_time": 8.609952049000185 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.033655643463135, | |
| "epoch": 0.00044, | |
| "grad_norm": 0.014094019308686256, | |
| "kl": 0.23375796806067228, | |
| "learning_rate": 9.999999992596503e-06, | |
| "loss": -0.003, | |
| "step": 44, | |
| "step_time": 5.138805102998958 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.010416666977107525, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 769.0, | |
| "completions/max_terminated_length": 769.0, | |
| "completions/mean_length": 323.90625, | |
| "completions/mean_terminated_length": 315.8333435058594, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.381109952926636, | |
| "epoch": 0.00045, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015758151188492775, | |
| "kl": 0.1762851346284151, | |
| "learning_rate": 9.999999990629948e-06, | |
| "loss": -0.0067, | |
| "num_tokens": 1168932.0, | |
| "reward": 2.3062658309936523, | |
| "reward_std": 1.6873681545257568, | |
| "rewards/rollout_reward_func/mean": 2.3062658309936523, | |
| "rewards/rollout_reward_func/std": 2.015537738800049, | |
| "sampling/importance_sampling_ratio/max": 0.28785669803619385, | |
| "sampling/importance_sampling_ratio/mean": 0.07849664986133575, | |
| "sampling/importance_sampling_ratio/min": 1.9031297972719374e-18, | |
| "sampling/sampling_logp_difference/max": 4.624307155609131, | |
| "sampling/sampling_logp_difference/mean": 1.4169467687606812, | |
| "step": 45, | |
| "step_time": 9.268211923999843 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.324137270450592, | |
| "epoch": 0.00046, | |
| "grad_norm": 0.016143618151545525, | |
| "kl": 0.17720989137887955, | |
| "learning_rate": 9.999999988432035e-06, | |
| "loss": -0.0069, | |
| "step": 46, | |
| "step_time": 4.723417413000334 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 640.0, | |
| "completions/max_terminated_length": 640.0, | |
| "completions/mean_length": 289.6875, | |
| "completions/mean_terminated_length": 289.6875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.798922121524811, | |
| "epoch": 0.00047, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022382335737347603, | |
| "kl": 0.45002966932952404, | |
| "learning_rate": 9.999999986002761e-06, | |
| "loss": -0.0059, | |
| "num_tokens": 1217968.0, | |
| "reward": 1.891412377357483, | |
| "reward_std": 1.739563226699829, | |
| "rewards/rollout_reward_func/mean": 1.891412377357483, | |
| "rewards/rollout_reward_func/std": 2.1437840461730957, | |
| "sampling/importance_sampling_ratio/max": 0.09762566536664963, | |
| "sampling/importance_sampling_ratio/mean": 0.04518824815750122, | |
| "sampling/importance_sampling_ratio/min": 4.220652438657879e-10, | |
| "sampling/sampling_logp_difference/max": 4.6860880851745605, | |
| "sampling/sampling_logp_difference/mean": 1.201680064201355, | |
| "step": 47, | |
| "step_time": 8.263996364999457 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0028409091755747795, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "entropy": 6.750116407871246, | |
| "epoch": 0.00048, | |
| "grad_norm": 0.015847016125917435, | |
| "kl": 0.3995134783908725, | |
| "learning_rate": 9.999999983342127e-06, | |
| "loss": -0.0061, | |
| "step": 48, | |
| "step_time": 4.558645490999879 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 758.0, | |
| "completions/max_terminated_length": 758.0, | |
| "completions/mean_length": 548.90625, | |
| "completions/mean_terminated_length": 548.90625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.013215720653534, | |
| "epoch": 0.00049, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.021171115338802338, | |
| "kl": 0.3200267134234309, | |
| "learning_rate": 9.999999980450137e-06, | |
| "loss": -0.0033, | |
| "num_tokens": 1277645.0, | |
| "reward": 2.5847978591918945, | |
| "reward_std": 0.8504736423492432, | |
| "rewards/rollout_reward_func/mean": 2.5847978591918945, | |
| "rewards/rollout_reward_func/std": 2.012620687484741, | |
| "sampling/importance_sampling_ratio/max": 0.10617782175540924, | |
| "sampling/importance_sampling_ratio/mean": 0.057969365268945694, | |
| "sampling/importance_sampling_ratio/min": 0.0006385967135429382, | |
| "sampling/sampling_logp_difference/max": 2.5639331340789795, | |
| "sampling/sampling_logp_difference/mean": 0.9616080522537231, | |
| "step": 49, | |
| "step_time": 8.934477900000275 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.897445023059845, | |
| "epoch": 0.0005, | |
| "grad_norm": 0.018860990181565285, | |
| "kl": 0.33296194672584534, | |
| "learning_rate": 9.999999977326787e-06, | |
| "loss": -0.0034, | |
| "step": 50, | |
| "step_time": 5.351620988000377 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 767.0, | |
| "completions/max_terminated_length": 767.0, | |
| "completions/mean_length": 537.46875, | |
| "completions/mean_terminated_length": 537.46875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.633717179298401, | |
| "epoch": 0.00051, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01536987628787756, | |
| "kl": 0.2537247408181429, | |
| "learning_rate": 9.999999973972076e-06, | |
| "loss": -0.0022, | |
| "num_tokens": 1337256.0, | |
| "reward": 2.237421989440918, | |
| "reward_std": 1.3681035041809082, | |
| "rewards/rollout_reward_func/mean": 2.237421989440918, | |
| "rewards/rollout_reward_func/std": 1.740581750869751, | |
| "sampling/importance_sampling_ratio/max": 0.1205218955874443, | |
| "sampling/importance_sampling_ratio/mean": 0.04655706137418747, | |
| "sampling/importance_sampling_ratio/min": 0.001190877752378583, | |
| "sampling/sampling_logp_difference/max": 2.396751880645752, | |
| "sampling/sampling_logp_difference/mean": 1.094531774520874, | |
| "step": 51, | |
| "step_time": 9.431871319000038 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "entropy": 6.529919326305389, | |
| "epoch": 0.00052, | |
| "grad_norm": 0.010579893365502357, | |
| "kl": 0.26615126617252827, | |
| "learning_rate": 9.999999970386004e-06, | |
| "loss": -0.0023, | |
| "step": 52, | |
| "step_time": 4.8396601220001685 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 646.0, | |
| "completions/max_terminated_length": 646.0, | |
| "completions/mean_length": 324.125, | |
| "completions/mean_terminated_length": 324.125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.771689236164093, | |
| "epoch": 0.00053, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015091736800968647, | |
| "kl": 0.2611819123849273, | |
| "learning_rate": 9.999999966568576e-06, | |
| "loss": -0.011, | |
| "num_tokens": 1386134.0, | |
| "reward": 2.217728853225708, | |
| "reward_std": 2.024404287338257, | |
| "rewards/rollout_reward_func/mean": 2.217728853225708, | |
| "rewards/rollout_reward_func/std": 2.3452394008636475, | |
| "sampling/importance_sampling_ratio/max": 0.3583432734012604, | |
| "sampling/importance_sampling_ratio/mean": 0.08469430357217789, | |
| "sampling/importance_sampling_ratio/min": 1.4002454964024292e-14, | |
| "sampling/sampling_logp_difference/max": 4.249520301818848, | |
| "sampling/sampling_logp_difference/mean": 1.3450038433074951, | |
| "step": 53, | |
| "step_time": 8.218689654000627 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.760413706302643, | |
| "epoch": 0.00054, | |
| "grad_norm": 0.015123301185667515, | |
| "kl": 0.26832089852541685, | |
| "learning_rate": 9.999999962519787e-06, | |
| "loss": -0.0111, | |
| "step": 54, | |
| "step_time": 4.458178430999396 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 785.0, | |
| "completions/max_terminated_length": 785.0, | |
| "completions/mean_length": 401.15625, | |
| "completions/mean_terminated_length": 393.1290283203125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.331356406211853, | |
| "epoch": 0.00055, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.07398711144924164, | |
| "kl": 0.45008302945643663, | |
| "learning_rate": 9.999999958239642e-06, | |
| "loss": -0.0078, | |
| "num_tokens": 1439006.0, | |
| "reward": 1.4235717058181763, | |
| "reward_std": 1.2011210918426514, | |
| "rewards/rollout_reward_func/mean": 1.4235717058181763, | |
| "rewards/rollout_reward_func/std": 1.8165862560272217, | |
| "sampling/importance_sampling_ratio/max": 0.37091660499572754, | |
| "sampling/importance_sampling_ratio/mean": 0.08141454309225082, | |
| "sampling/importance_sampling_ratio/min": 5.328838854689677e-16, | |
| "sampling/sampling_logp_difference/max": 6.0843353271484375, | |
| "sampling/sampling_logp_difference/mean": 1.2536330223083496, | |
| "step": 55, | |
| "step_time": 8.796601195999756 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 6.339613497257233, | |
| "epoch": 0.00056, | |
| "grad_norm": 0.026661040261387825, | |
| "kl": 0.4182750675827265, | |
| "learning_rate": 9.999999953728133e-06, | |
| "loss": -0.0081, | |
| "step": 56, | |
| "step_time": 5.088884858999336 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02025462966412306, | |
| "clip_ratio/high_mean": 0.01012731483206153, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01012731483206153, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 767.0, | |
| "completions/max_terminated_length": 767.0, | |
| "completions/mean_length": 436.9375, | |
| "completions/mean_terminated_length": 436.9375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.944717228412628, | |
| "epoch": 0.00057, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01468927413225174, | |
| "kl": 0.3506404645740986, | |
| "learning_rate": 9.999999948985266e-06, | |
| "loss": -0.0092, | |
| "num_tokens": 1493962.0, | |
| "reward": 1.8320198059082031, | |
| "reward_std": 0.8225011825561523, | |
| "rewards/rollout_reward_func/mean": 1.8320198059082031, | |
| "rewards/rollout_reward_func/std": 1.2535020112991333, | |
| "sampling/importance_sampling_ratio/max": 0.3948986232280731, | |
| "sampling/importance_sampling_ratio/mean": 0.11361236125230789, | |
| "sampling/importance_sampling_ratio/min": 1.1934993257971545e-26, | |
| "sampling/sampling_logp_difference/max": 11.387396812438965, | |
| "sampling/sampling_logp_difference/mean": 1.256805658340454, | |
| "step": 57, | |
| "step_time": 9.449572570999862 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004629629664123058, | |
| "clip_ratio/high_mean": 0.002314814832061529, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.002314814832061529, | |
| "entropy": 5.935160547494888, | |
| "epoch": 0.00058, | |
| "grad_norm": 0.015919912606477737, | |
| "kl": 0.3587344065308571, | |
| "learning_rate": 9.99999994401104e-06, | |
| "loss": -0.0093, | |
| "step": 58, | |
| "step_time": 4.746457023000403 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 751.0, | |
| "completions/max_terminated_length": 751.0, | |
| "completions/mean_length": 480.59375, | |
| "completions/mean_terminated_length": 480.59375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.848488986492157, | |
| "epoch": 0.00059, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.011293532326817513, | |
| "kl": 0.3660230152308941, | |
| "learning_rate": 9.999999938805455e-06, | |
| "loss": -0.0115, | |
| "num_tokens": 1551299.0, | |
| "reward": 2.3304104804992676, | |
| "reward_std": 1.6922229528427124, | |
| "rewards/rollout_reward_func/mean": 2.3304104804992676, | |
| "rewards/rollout_reward_func/std": 1.9971063137054443, | |
| "sampling/importance_sampling_ratio/max": 0.1643364131450653, | |
| "sampling/importance_sampling_ratio/mean": 0.08756053447723389, | |
| "sampling/importance_sampling_ratio/min": 0.0002673097769729793, | |
| "sampling/sampling_logp_difference/max": 2.872664213180542, | |
| "sampling/sampling_logp_difference/mean": 0.9991017580032349, | |
| "step": 59, | |
| "step_time": 8.483788936999645 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.8199068903923035, | |
| "epoch": 0.0006, | |
| "grad_norm": 0.009261633269488811, | |
| "kl": 0.3585043679922819, | |
| "learning_rate": 9.999999933368511e-06, | |
| "loss": -0.0116, | |
| "step": 60, | |
| "step_time": 4.707603984000343 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 697.0, | |
| "completions/max_terminated_length": 697.0, | |
| "completions/mean_length": 475.6875, | |
| "completions/mean_terminated_length": 470.774169921875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.473706513643265, | |
| "epoch": 0.00061, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013257281854748726, | |
| "kl": 0.32057628221809864, | |
| "learning_rate": 9.999999927700208e-06, | |
| "loss": -0.0136, | |
| "num_tokens": 1608125.0, | |
| "reward": 3.2187983989715576, | |
| "reward_std": 1.597353458404541, | |
| "rewards/rollout_reward_func/mean": 3.2187983989715576, | |
| "rewards/rollout_reward_func/std": 2.062941551208496, | |
| "sampling/importance_sampling_ratio/max": 0.17399519681930542, | |
| "sampling/importance_sampling_ratio/mean": 0.09229454398155212, | |
| "sampling/importance_sampling_ratio/min": 5.896088738771565e-13, | |
| "sampling/sampling_logp_difference/max": 4.472219944000244, | |
| "sampling/sampling_logp_difference/mean": 1.0552775859832764, | |
| "step": 61, | |
| "step_time": 9.438191732000632 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0028409091755747795, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "entropy": 5.456495136022568, | |
| "epoch": 0.00062, | |
| "grad_norm": 0.01240911427885294, | |
| "kl": 0.32513533532619476, | |
| "learning_rate": 9.999999921800544e-06, | |
| "loss": -0.0137, | |
| "step": 62, | |
| "step_time": 4.807036896999307 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 733.0, | |
| "completions/max_terminated_length": 733.0, | |
| "completions/mean_length": 240.96875, | |
| "completions/mean_terminated_length": 226.9677276611328, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.3534833788871765, | |
| "epoch": 0.00063, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01809127815067768, | |
| "kl": 0.36241104267537594, | |
| "learning_rate": 9.999999915669521e-06, | |
| "loss": -0.0133, | |
| "num_tokens": 1652976.0, | |
| "reward": 2.7536964416503906, | |
| "reward_std": 1.9717084169387817, | |
| "rewards/rollout_reward_func/mean": 2.7536964416503906, | |
| "rewards/rollout_reward_func/std": 2.0205845832824707, | |
| "sampling/importance_sampling_ratio/max": 0.43795350193977356, | |
| "sampling/importance_sampling_ratio/mean": 0.12350660562515259, | |
| "sampling/importance_sampling_ratio/min": 5.083219400958683e-10, | |
| "sampling/sampling_logp_difference/max": 3.9410667419433594, | |
| "sampling/sampling_logp_difference/mean": 1.179007887840271, | |
| "step": 63, | |
| "step_time": 8.806214823998744 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07859848625957966, | |
| "clip_ratio/high_mean": 0.03929924312978983, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03929924312978983, | |
| "entropy": 6.309255123138428, | |
| "epoch": 0.00064, | |
| "grad_norm": 0.013196753337979317, | |
| "kl": 0.3604668825864792, | |
| "learning_rate": 9.99999990930714e-06, | |
| "loss": -0.0134, | |
| "step": 64, | |
| "step_time": 4.585649621999437 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 749.0, | |
| "completions/max_terminated_length": 749.0, | |
| "completions/mean_length": 520.15625, | |
| "completions/mean_terminated_length": 520.15625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.630438059568405, | |
| "epoch": 0.00065, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009761711582541466, | |
| "kl": 0.25976699963212013, | |
| "learning_rate": 9.999999902713398e-06, | |
| "loss": -0.0096, | |
| "num_tokens": 1710411.0, | |
| "reward": 2.439373731613159, | |
| "reward_std": 1.6874518394470215, | |
| "rewards/rollout_reward_func/mean": 2.439373731613159, | |
| "rewards/rollout_reward_func/std": 1.8576425313949585, | |
| "sampling/importance_sampling_ratio/max": 0.1951960325241089, | |
| "sampling/importance_sampling_ratio/mean": 0.06827103346586227, | |
| "sampling/importance_sampling_ratio/min": 1.1635305696700016e-07, | |
| "sampling/sampling_logp_difference/max": 4.405527591705322, | |
| "sampling/sampling_logp_difference/mean": 1.1796586513519287, | |
| "step": 65, | |
| "step_time": 8.827101629000026 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 6.571233093738556, | |
| "epoch": 0.00066, | |
| "grad_norm": 0.008613799698650837, | |
| "kl": 0.2607234949246049, | |
| "learning_rate": 9.999999895888298e-06, | |
| "loss": -0.0096, | |
| "step": 66, | |
| "step_time": 4.809835060999376 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 679.0, | |
| "completions/max_terminated_length": 679.0, | |
| "completions/mean_length": 301.1875, | |
| "completions/mean_terminated_length": 301.1875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.944601535797119, | |
| "epoch": 0.00067, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.012496919371187687, | |
| "kl": 0.4631600920110941, | |
| "learning_rate": 9.99999988883184e-06, | |
| "loss": -0.0128, | |
| "num_tokens": 1759738.0, | |
| "reward": 1.9331152439117432, | |
| "reward_std": 1.1334162950515747, | |
| "rewards/rollout_reward_func/mean": 1.9331152439117432, | |
| "rewards/rollout_reward_func/std": 2.0543787479400635, | |
| "sampling/importance_sampling_ratio/max": 0.46345254778862, | |
| "sampling/importance_sampling_ratio/mean": 0.1404380202293396, | |
| "sampling/importance_sampling_ratio/min": 5.890969418942404e-07, | |
| "sampling/sampling_logp_difference/max": 4.386819839477539, | |
| "sampling/sampling_logp_difference/mean": 1.1129919290542603, | |
| "step": 67, | |
| "step_time": 8.765856677001011 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.008928571827709675, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008928571827709675, | |
| "entropy": 5.895752668380737, | |
| "epoch": 0.00068, | |
| "grad_norm": 0.012568553909659386, | |
| "kl": 0.46327478997409344, | |
| "learning_rate": 9.999999881544019e-06, | |
| "loss": -0.0128, | |
| "step": 68, | |
| "step_time": 5.110902637000436 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 794.0, | |
| "completions/max_terminated_length": 794.0, | |
| "completions/mean_length": 452.34375, | |
| "completions/mean_terminated_length": 452.34375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.041722625494003, | |
| "epoch": 0.00069, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.07526283711194992, | |
| "kl": 0.6064412295818329, | |
| "learning_rate": 9.999999874024841e-06, | |
| "loss": -0.0146, | |
| "num_tokens": 1814594.0, | |
| "reward": 3.077542781829834, | |
| "reward_std": 1.2306993007659912, | |
| "rewards/rollout_reward_func/mean": 3.077542781829834, | |
| "rewards/rollout_reward_func/std": 1.810649037361145, | |
| "sampling/importance_sampling_ratio/max": 0.4706134796142578, | |
| "sampling/importance_sampling_ratio/mean": 0.165305495262146, | |
| "sampling/importance_sampling_ratio/min": 0.0011951870983466506, | |
| "sampling/sampling_logp_difference/max": 2.497037887573242, | |
| "sampling/sampling_logp_difference/mean": 0.7777004837989807, | |
| "step": 69, | |
| "step_time": 8.851037519998954 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 4.988219231367111, | |
| "epoch": 0.0007, | |
| "grad_norm": 0.20997123420238495, | |
| "kl": 0.8107541762292385, | |
| "learning_rate": 9.999999866274303e-06, | |
| "loss": -0.0143, | |
| "step": 70, | |
| "step_time": 4.902573127999858 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 697.0, | |
| "completions/max_terminated_length": 697.0, | |
| "completions/mean_length": 275.4375, | |
| "completions/mean_terminated_length": 265.8709716796875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.022567570209503, | |
| "epoch": 0.00071, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02304830029606819, | |
| "kl": 0.34742444939911366, | |
| "learning_rate": 9.999999858292407e-06, | |
| "loss": -0.0139, | |
| "num_tokens": 1861821.0, | |
| "reward": 2.4483556747436523, | |
| "reward_std": 1.4657219648361206, | |
| "rewards/rollout_reward_func/mean": 2.4483556747436523, | |
| "rewards/rollout_reward_func/std": 1.791189432144165, | |
| "sampling/importance_sampling_ratio/max": 0.47685861587524414, | |
| "sampling/importance_sampling_ratio/mean": 0.16110098361968994, | |
| "sampling/importance_sampling_ratio/min": 1.8858786060560462e-11, | |
| "sampling/sampling_logp_difference/max": 4.634000301361084, | |
| "sampling/sampling_logp_difference/mean": 1.0669944286346436, | |
| "step": 71, | |
| "step_time": 7.997900912999285 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.069274663925171, | |
| "epoch": 0.00072, | |
| "grad_norm": 0.02166938968002796, | |
| "kl": 0.34302423894405365, | |
| "learning_rate": 9.99999985007915e-06, | |
| "loss": -0.0139, | |
| "step": 72, | |
| "step_time": 4.490313349000189 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 805.0, | |
| "completions/max_terminated_length": 805.0, | |
| "completions/mean_length": 488.6875, | |
| "completions/mean_terminated_length": 503.9354553222656, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.873573303222656, | |
| "epoch": 0.00073, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01142603438347578, | |
| "kl": 0.3447978002950549, | |
| "learning_rate": 9.999999841634535e-06, | |
| "loss": -0.0091, | |
| "num_tokens": 1916473.0, | |
| "reward": 2.781144857406616, | |
| "reward_std": 0.9423757791519165, | |
| "rewards/rollout_reward_func/mean": 2.781144857406616, | |
| "rewards/rollout_reward_func/std": 1.5482388734817505, | |
| "sampling/importance_sampling_ratio/max": 0.4879491329193115, | |
| "sampling/importance_sampling_ratio/mean": 0.12894627451896667, | |
| "sampling/importance_sampling_ratio/min": 5.110472808822486e-12, | |
| "sampling/sampling_logp_difference/max": 3.6776225566864014, | |
| "sampling/sampling_logp_difference/mean": 1.0262196063995361, | |
| "step": 73, | |
| "step_time": 8.777917193999201 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.9237218499183655, | |
| "epoch": 0.00074, | |
| "grad_norm": 0.011944163590669632, | |
| "kl": 0.3441983833909035, | |
| "learning_rate": 9.99999983295856e-06, | |
| "loss": -0.0091, | |
| "step": 74, | |
| "step_time": 5.257215922000341 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 635.0, | |
| "completions/max_terminated_length": 635.0, | |
| "completions/mean_length": 217.28125, | |
| "completions/mean_terminated_length": 206.09677124023438, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.533183515071869, | |
| "epoch": 0.00075, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.04438586160540581, | |
| "kl": 0.6188310664147139, | |
| "learning_rate": 9.999999824051225e-06, | |
| "loss": -0.0088, | |
| "num_tokens": 1961274.0, | |
| "reward": 2.8512258529663086, | |
| "reward_std": 0.732064962387085, | |
| "rewards/rollout_reward_func/mean": 2.8512258529663086, | |
| "rewards/rollout_reward_func/std": 1.670456051826477, | |
| "sampling/importance_sampling_ratio/max": 0.49958670139312744, | |
| "sampling/importance_sampling_ratio/mean": 0.21123701333999634, | |
| "sampling/importance_sampling_ratio/min": 3.237672987783241e-14, | |
| "sampling/sampling_logp_difference/max": 5.040909767150879, | |
| "sampling/sampling_logp_difference/mean": 1.135288953781128, | |
| "step": 75, | |
| "step_time": 7.92215164100071 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02083333395421505, | |
| "clip_ratio/high_mean": 0.010416666977107525, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "entropy": 5.562716752290726, | |
| "epoch": 0.00076, | |
| "grad_norm": 0.02424151450395584, | |
| "kl": 0.565250052139163, | |
| "learning_rate": 9.999999814912531e-06, | |
| "loss": -0.0089, | |
| "step": 76, | |
| "step_time": 4.395662217000336 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 794.0, | |
| "completions/max_terminated_length": 794.0, | |
| "completions/mean_length": 291.0625, | |
| "completions/mean_terminated_length": 289.93548583984375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.875417947769165, | |
| "epoch": 0.00077, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015563178807497025, | |
| "kl": 0.41501787677407265, | |
| "learning_rate": 9.999999805542478e-06, | |
| "loss": -0.0118, | |
| "num_tokens": 2010634.0, | |
| "reward": 1.0790379047393799, | |
| "reward_std": 1.1985231637954712, | |
| "rewards/rollout_reward_func/mean": 1.0790379047393799, | |
| "rewards/rollout_reward_func/std": 1.5060786008834839, | |
| "sampling/importance_sampling_ratio/max": 0.5051907300949097, | |
| "sampling/importance_sampling_ratio/mean": 0.1884518414735794, | |
| "sampling/importance_sampling_ratio/min": 1.3433022472142397e-09, | |
| "sampling/sampling_logp_difference/max": 10.98376750946045, | |
| "sampling/sampling_logp_difference/mean": 1.1490110158920288, | |
| "step": 77, | |
| "step_time": 8.915195299999596 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0062500000931322575, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0062500000931322575, | |
| "entropy": 5.885697066783905, | |
| "epoch": 0.00078, | |
| "grad_norm": 0.014125452376902103, | |
| "kl": 0.4149230867624283, | |
| "learning_rate": 9.999999795941065e-06, | |
| "loss": -0.0119, | |
| "step": 78, | |
| "step_time": 5.149517803999061 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 794.0, | |
| "completions/max_terminated_length": 794.0, | |
| "completions/mean_length": 291.9375, | |
| "completions/mean_terminated_length": 300.8387145996094, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.519651710987091, | |
| "epoch": 0.00079, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.10713054239749908, | |
| "kl": 0.5323284231126308, | |
| "learning_rate": 9.999999786108293e-06, | |
| "loss": -0.0227, | |
| "num_tokens": 2059737.0, | |
| "reward": 1.5329793691635132, | |
| "reward_std": 0.5013623237609863, | |
| "rewards/rollout_reward_func/mean": 1.5329793691635132, | |
| "rewards/rollout_reward_func/std": 1.3323529958724976, | |
| "sampling/importance_sampling_ratio/max": 0.5639442801475525, | |
| "sampling/importance_sampling_ratio/mean": 0.25227874517440796, | |
| "sampling/importance_sampling_ratio/min": 2.7577478468139224e-14, | |
| "sampling/sampling_logp_difference/max": 3.539957046508789, | |
| "sampling/sampling_logp_difference/mean": 1.098191261291504, | |
| "step": 79, | |
| "step_time": 8.723111569000139 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.00631313119083643, | |
| "clip_ratio/low_min": 0.005681818351149559, | |
| "clip_ratio/region_mean": 0.01412563119083643, | |
| "entropy": 5.470509052276611, | |
| "epoch": 0.0008, | |
| "grad_norm": 0.06396406888961792, | |
| "kl": 0.5383136495947838, | |
| "learning_rate": 9.999999776044163e-06, | |
| "loss": -0.0233, | |
| "step": 80, | |
| "step_time": 4.934400118000212 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 794.0, | |
| "completions/max_terminated_length": 794.0, | |
| "completions/mean_length": 384.1875, | |
| "completions/mean_terminated_length": 371.0322570800781, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.105532318353653, | |
| "epoch": 0.00081, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.014338403008878231, | |
| "kl": 0.5314097702503204, | |
| "learning_rate": 9.999999765748672e-06, | |
| "loss": -0.0119, | |
| "num_tokens": 2109547.0, | |
| "reward": 3.0377540588378906, | |
| "reward_std": 1.2538565397262573, | |
| "rewards/rollout_reward_func/mean": 3.0377540588378906, | |
| "rewards/rollout_reward_func/std": 1.6159894466400146, | |
| "sampling/importance_sampling_ratio/max": 0.5120093822479248, | |
| "sampling/importance_sampling_ratio/mean": 0.18446674942970276, | |
| "sampling/importance_sampling_ratio/min": 3.6131722613852446e-11, | |
| "sampling/sampling_logp_difference/max": 3.793142080307007, | |
| "sampling/sampling_logp_difference/mean": 0.8700344562530518, | |
| "step": 81, | |
| "step_time": 8.634114757999669 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0031250000465661287, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250000465661287, | |
| "entropy": 5.07496240735054, | |
| "epoch": 0.00082, | |
| "grad_norm": 0.012873583473265171, | |
| "kl": 0.5282706655561924, | |
| "learning_rate": 9.999999755221823e-06, | |
| "loss": -0.012, | |
| "step": 82, | |
| "step_time": 4.737106287001097 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 803.0, | |
| "completions/max_terminated_length": 803.0, | |
| "completions/mean_length": 284.125, | |
| "completions/mean_terminated_length": 284.125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.206637322902679, | |
| "epoch": 0.00083, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02513122372329235, | |
| "kl": 0.5926351137459278, | |
| "learning_rate": 9.999999744463613e-06, | |
| "loss": -0.0183, | |
| "num_tokens": 2156768.0, | |
| "reward": 2.840292453765869, | |
| "reward_std": 1.3401542901992798, | |
| "rewards/rollout_reward_func/mean": 2.840292453765869, | |
| "rewards/rollout_reward_func/std": 1.8102028369903564, | |
| "sampling/importance_sampling_ratio/max": 0.520283579826355, | |
| "sampling/importance_sampling_ratio/mean": 0.2087681144475937, | |
| "sampling/importance_sampling_ratio/min": 2.49877535329901e-10, | |
| "sampling/sampling_logp_difference/max": 4.733933448791504, | |
| "sampling/sampling_logp_difference/mean": 0.9762767553329468, | |
| "step": 83, | |
| "step_time": 8.479427639000278 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.208840876817703, | |
| "epoch": 0.00084, | |
| "grad_norm": 0.03026541694998741, | |
| "kl": 0.6035371646285057, | |
| "learning_rate": 9.999999733474045e-06, | |
| "loss": -0.0183, | |
| "step": 84, | |
| "step_time": 5.254075981999904 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 848.0, | |
| "completions/max_terminated_length": 848.0, | |
| "completions/mean_length": 450.0, | |
| "completions/mean_terminated_length": 464.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.834830671548843, | |
| "epoch": 0.00085, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.012555217370390892, | |
| "kl": 0.3606609106063843, | |
| "learning_rate": 9.999999722253117e-06, | |
| "loss": -0.0117, | |
| "num_tokens": 2211652.0, | |
| "reward": 1.648930311203003, | |
| "reward_std": 1.215954065322876, | |
| "rewards/rollout_reward_func/mean": 1.648930311203003, | |
| "rewards/rollout_reward_func/std": 1.3405512571334839, | |
| "sampling/importance_sampling_ratio/max": 0.5233410000801086, | |
| "sampling/importance_sampling_ratio/mean": 0.13886834681034088, | |
| "sampling/importance_sampling_ratio/min": 1.688752483300254e-16, | |
| "sampling/sampling_logp_difference/max": 3.9144070148468018, | |
| "sampling/sampling_logp_difference/mean": 1.1154017448425293, | |
| "step": 85, | |
| "step_time": 9.084167009999419 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "entropy": 5.833049297332764, | |
| "epoch": 0.00086, | |
| "grad_norm": 0.012575902976095676, | |
| "kl": 0.3626660779118538, | |
| "learning_rate": 9.99999971080083e-06, | |
| "loss": -0.0118, | |
| "step": 86, | |
| "step_time": 5.319575449999775 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 722.0, | |
| "completions/max_terminated_length": 722.0, | |
| "completions/mean_length": 235.9375, | |
| "completions/mean_terminated_length": 243.03225708007812, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.763930678367615, | |
| "epoch": 0.00087, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014682337641716003, | |
| "kl": 0.39749570190906525, | |
| "learning_rate": 9.999999699117184e-06, | |
| "loss": -0.0146, | |
| "num_tokens": 2257453.0, | |
| "reward": 2.5003364086151123, | |
| "reward_std": 1.9458928108215332, | |
| "rewards/rollout_reward_func/mean": 2.5003364086151123, | |
| "rewards/rollout_reward_func/std": 2.2692220211029053, | |
| "sampling/importance_sampling_ratio/max": 0.5295130610466003, | |
| "sampling/importance_sampling_ratio/mean": 0.11728590726852417, | |
| "sampling/importance_sampling_ratio/min": 3.7461741065995813e-13, | |
| "sampling/sampling_logp_difference/max": 3.0392837524414062, | |
| "sampling/sampling_logp_difference/mean": 1.4054160118103027, | |
| "step": 87, | |
| "step_time": 8.034342769000887 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.757734656333923, | |
| "epoch": 0.00088, | |
| "grad_norm": 0.013116477057337761, | |
| "kl": 0.38789064437150955, | |
| "learning_rate": 9.999999687202177e-06, | |
| "loss": -0.0146, | |
| "step": 88, | |
| "step_time": 4.444944719000432 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 755.0, | |
| "completions/max_terminated_length": 741.0, | |
| "completions/mean_length": 447.34375, | |
| "completions/mean_terminated_length": 451.4667053222656, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.2215496301651, | |
| "epoch": 0.00089, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013803564012050629, | |
| "kl": 0.5341093055903912, | |
| "learning_rate": 9.999999675055814e-06, | |
| "loss": -0.0177, | |
| "num_tokens": 2312250.0, | |
| "reward": 1.7047960758209229, | |
| "reward_std": 1.1020748615264893, | |
| "rewards/rollout_reward_func/mean": 1.7047960758209229, | |
| "rewards/rollout_reward_func/std": 1.8067169189453125, | |
| "sampling/importance_sampling_ratio/max": 0.5259312391281128, | |
| "sampling/importance_sampling_ratio/mean": 0.17643724381923676, | |
| "sampling/importance_sampling_ratio/min": 2.5637248400600665e-12, | |
| "sampling/sampling_logp_difference/max": 5.182948589324951, | |
| "sampling/sampling_logp_difference/mean": 1.0466536283493042, | |
| "step": 89, | |
| "step_time": 8.874949301000925 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.204747676849365, | |
| "epoch": 0.0009, | |
| "grad_norm": 0.01240418292582035, | |
| "kl": 0.5262163020670414, | |
| "learning_rate": 9.999999662678088e-06, | |
| "loss": -0.0177, | |
| "step": 90, | |
| "step_time": 5.186326979000114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 823.0, | |
| "completions/max_terminated_length": 823.0, | |
| "completions/mean_length": 558.34375, | |
| "completions/mean_terminated_length": 558.34375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.52349066734314, | |
| "epoch": 0.00091, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.010751481167972088, | |
| "kl": 0.408858347684145, | |
| "learning_rate": 9.999999650069006e-06, | |
| "loss": -0.0171, | |
| "num_tokens": 2371581.0, | |
| "reward": 1.3657488822937012, | |
| "reward_std": 1.0520522594451904, | |
| "rewards/rollout_reward_func/mean": 1.3657488822937012, | |
| "rewards/rollout_reward_func/std": 1.2561410665512085, | |
| "sampling/importance_sampling_ratio/max": 0.2838555574417114, | |
| "sampling/importance_sampling_ratio/mean": 0.12384120374917984, | |
| "sampling/importance_sampling_ratio/min": 4.310294829390493e-11, | |
| "sampling/sampling_logp_difference/max": 3.8953909873962402, | |
| "sampling/sampling_logp_difference/mean": 0.9854850769042969, | |
| "step": 91, | |
| "step_time": 9.148271317999388 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "entropy": 5.50886458158493, | |
| "epoch": 0.00092, | |
| "grad_norm": 0.011888951063156128, | |
| "kl": 0.4073612429201603, | |
| "learning_rate": 9.999999637228563e-06, | |
| "loss": -0.0171, | |
| "step": 92, | |
| "step_time": 5.326846187999763 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 751.0, | |
| "completions/max_terminated_length": 751.0, | |
| "completions/mean_length": 254.34375, | |
| "completions/mean_terminated_length": 254.34375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.463173568248749, | |
| "epoch": 0.00093, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.011303669773042202, | |
| "kl": 0.4870901219546795, | |
| "learning_rate": 9.99999962415676e-06, | |
| "loss": -0.0089, | |
| "num_tokens": 2416625.0, | |
| "reward": 2.9061293601989746, | |
| "reward_std": 1.1144578456878662, | |
| "rewards/rollout_reward_func/mean": 2.9061293601989746, | |
| "rewards/rollout_reward_func/std": 1.872575283050537, | |
| "sampling/importance_sampling_ratio/max": 0.538914144039154, | |
| "sampling/importance_sampling_ratio/mean": 0.23089367151260376, | |
| "sampling/importance_sampling_ratio/min": 0.001189270755276084, | |
| "sampling/sampling_logp_difference/max": 2.552368640899658, | |
| "sampling/sampling_logp_difference/mean": 0.8883153200149536, | |
| "step": 93, | |
| "step_time": 8.310906286000318 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.453524351119995, | |
| "epoch": 0.00094, | |
| "grad_norm": 0.011560036800801754, | |
| "kl": 0.49043361097574234, | |
| "learning_rate": 9.999999610853598e-06, | |
| "loss": -0.009, | |
| "step": 94, | |
| "step_time": 4.578007039998738 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 695.0, | |
| "completions/max_terminated_length": 695.0, | |
| "completions/mean_length": 323.0, | |
| "completions/mean_terminated_length": 323.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.127210021018982, | |
| "epoch": 0.00095, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016390182077884674, | |
| "kl": 0.40985831432044506, | |
| "learning_rate": 9.999999597319077e-06, | |
| "loss": -0.0144, | |
| "num_tokens": 2467574.0, | |
| "reward": 2.420485019683838, | |
| "reward_std": 1.8477942943572998, | |
| "rewards/rollout_reward_func/mean": 2.420485019683838, | |
| "rewards/rollout_reward_func/std": 2.099099636077881, | |
| "sampling/importance_sampling_ratio/max": 0.5398046970367432, | |
| "sampling/importance_sampling_ratio/mean": 0.14346127212047577, | |
| "sampling/importance_sampling_ratio/min": 7.899796268528991e-12, | |
| "sampling/sampling_logp_difference/max": 3.005741596221924, | |
| "sampling/sampling_logp_difference/mean": 1.1369695663452148, | |
| "step": 95, | |
| "step_time": 9.084458965998692 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.112871825695038, | |
| "epoch": 0.00096, | |
| "grad_norm": 0.015214670449495316, | |
| "kl": 0.4128855764865875, | |
| "learning_rate": 9.999999583553198e-06, | |
| "loss": -0.0144, | |
| "step": 96, | |
| "step_time": 4.6464639670002725 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0028409091755747795, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 404.1875, | |
| "completions/mean_terminated_length": 413.0000305175781, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.4772584438323975, | |
| "epoch": 0.00097, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013531319797039032, | |
| "kl": 0.33689754270017147, | |
| "learning_rate": 9.999999569555958e-06, | |
| "loss": -0.0195, | |
| "num_tokens": 2519297.0, | |
| "reward": 1.2118922472000122, | |
| "reward_std": 0.9071276187896729, | |
| "rewards/rollout_reward_func/mean": 1.2118922472000122, | |
| "rewards/rollout_reward_func/std": 1.6393996477127075, | |
| "sampling/importance_sampling_ratio/max": 0.5455878376960754, | |
| "sampling/importance_sampling_ratio/mean": 0.12733827531337738, | |
| "sampling/importance_sampling_ratio/min": 3.602854342990569e-12, | |
| "sampling/sampling_logp_difference/max": 4.47752046585083, | |
| "sampling/sampling_logp_difference/mean": 1.2203165292739868, | |
| "step": 97, | |
| "step_time": 9.439923181000268 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.469627916812897, | |
| "epoch": 0.00098, | |
| "grad_norm": 0.011723111383616924, | |
| "kl": 0.3316629286855459, | |
| "learning_rate": 9.99999955532736e-06, | |
| "loss": -0.0196, | |
| "step": 98, | |
| "step_time": 4.828754623999885 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 848.0, | |
| "completions/max_terminated_length": 848.0, | |
| "completions/mean_length": 536.90625, | |
| "completions/mean_terminated_length": 536.90625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.867543816566467, | |
| "epoch": 0.00099, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00992999505251646, | |
| "kl": 0.35577549040317535, | |
| "learning_rate": 9.999999540867401e-06, | |
| "loss": -0.0137, | |
| "num_tokens": 2578290.0, | |
| "reward": 2.7717137336730957, | |
| "reward_std": 1.5454163551330566, | |
| "rewards/rollout_reward_func/mean": 2.7717137336730957, | |
| "rewards/rollout_reward_func/std": 2.1133692264556885, | |
| "sampling/importance_sampling_ratio/max": 0.2941095232963562, | |
| "sampling/importance_sampling_ratio/mean": 0.1273624747991562, | |
| "sampling/importance_sampling_ratio/min": 3.663484793303695e-10, | |
| "sampling/sampling_logp_difference/max": 2.9304747581481934, | |
| "sampling/sampling_logp_difference/mean": 1.0121349096298218, | |
| "step": 99, | |
| "step_time": 9.026237381999636 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 5.866902112960815, | |
| "epoch": 0.001, | |
| "grad_norm": 0.008921584114432335, | |
| "kl": 0.3485883306711912, | |
| "learning_rate": 9.999999526176084e-06, | |
| "loss": -0.0137, | |
| "step": 100, | |
| "step_time": 4.884540873000333 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 767.0, | |
| "completions/max_terminated_length": 767.0, | |
| "completions/mean_length": 318.6875, | |
| "completions/mean_terminated_length": 318.6875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.43700635433197, | |
| "epoch": 0.00101, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006189876701682806, | |
| "kl": 0.5527824461460114, | |
| "learning_rate": 9.999999511253408e-06, | |
| "loss": -0.0067, | |
| "num_tokens": 2626871.0, | |
| "reward": 2.009554862976074, | |
| "reward_std": 0.6518650054931641, | |
| "rewards/rollout_reward_func/mean": 2.009554862976074, | |
| "rewards/rollout_reward_func/std": 1.5595077276229858, | |
| "sampling/importance_sampling_ratio/max": 0.5488349199295044, | |
| "sampling/importance_sampling_ratio/mean": 0.19652444124221802, | |
| "sampling/importance_sampling_ratio/min": 0.0011575144017115235, | |
| "sampling/sampling_logp_difference/max": 2.923558473587036, | |
| "sampling/sampling_logp_difference/mean": 0.9920997619628906, | |
| "step": 101, | |
| "step_time": 9.15831846600031 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.010416666977107525, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "entropy": 5.427984565496445, | |
| "epoch": 0.00102, | |
| "grad_norm": 0.006101899314671755, | |
| "kl": 0.556145828217268, | |
| "learning_rate": 9.99999949609937e-06, | |
| "loss": -0.0067, | |
| "step": 102, | |
| "step_time": 4.759649325001192 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 721.0, | |
| "completions/max_terminated_length": 721.0, | |
| "completions/mean_length": 266.03125, | |
| "completions/mean_terminated_length": 257.8709716796875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.900549799203873, | |
| "epoch": 0.00103, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.007550720125436783, | |
| "kl": 0.46073490381240845, | |
| "learning_rate": 9.999999480713976e-06, | |
| "loss": -0.0124, | |
| "num_tokens": 2674005.0, | |
| "reward": 3.7651147842407227, | |
| "reward_std": 1.2847379446029663, | |
| "rewards/rollout_reward_func/mean": 3.7651147842407227, | |
| "rewards/rollout_reward_func/std": 1.5383661985397339, | |
| "sampling/importance_sampling_ratio/max": 0.5506238341331482, | |
| "sampling/importance_sampling_ratio/mean": 0.26077592372894287, | |
| "sampling/importance_sampling_ratio/min": 9.013636733702646e-14, | |
| "sampling/sampling_logp_difference/max": 3.8743910789489746, | |
| "sampling/sampling_logp_difference/mean": 0.9717509150505066, | |
| "step": 103, | |
| "step_time": 8.535875374000625 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.909915745258331, | |
| "epoch": 0.00104, | |
| "grad_norm": 0.007524185813963413, | |
| "kl": 0.45878610014915466, | |
| "learning_rate": 9.99999946509722e-06, | |
| "loss": -0.0124, | |
| "step": 104, | |
| "step_time": 4.519358364999334 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 694.0, | |
| "completions/max_terminated_length": 694.0, | |
| "completions/mean_length": 263.75, | |
| "completions/mean_terminated_length": 271.7419128417969, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.765733242034912, | |
| "epoch": 0.00105, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006037894636392593, | |
| "kl": 0.5730921756476164, | |
| "learning_rate": 9.999999449249107e-06, | |
| "loss": -0.0164, | |
| "num_tokens": 2719481.0, | |
| "reward": 3.5972437858581543, | |
| "reward_std": 1.0879021883010864, | |
| "rewards/rollout_reward_func/mean": 3.5972437858581543, | |
| "rewards/rollout_reward_func/std": 1.4054116010665894, | |
| "sampling/importance_sampling_ratio/max": 0.5536695122718811, | |
| "sampling/importance_sampling_ratio/mean": 0.25917497277259827, | |
| "sampling/importance_sampling_ratio/min": 7.426475804095389e-06, | |
| "sampling/sampling_logp_difference/max": 3.8071212768554688, | |
| "sampling/sampling_logp_difference/mean": 0.7888709902763367, | |
| "step": 105, | |
| "step_time": 7.902388452000196 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.764300674200058, | |
| "epoch": 0.00106, | |
| "grad_norm": 0.006110228598117828, | |
| "kl": 0.5746921207755804, | |
| "learning_rate": 9.999999433169634e-06, | |
| "loss": -0.0165, | |
| "step": 106, | |
| "step_time": 4.451162388999819 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 796.0, | |
| "completions/max_terminated_length": 796.0, | |
| "completions/mean_length": 504.03125, | |
| "completions/mean_terminated_length": 504.03125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.259044528007507, | |
| "epoch": 0.00107, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.09284374862909317, | |
| "kl": 0.5678669139742851, | |
| "learning_rate": 9.999999416858801e-06, | |
| "loss": -0.023, | |
| "num_tokens": 2778110.0, | |
| "reward": 3.0518195629119873, | |
| "reward_std": 2.0326967239379883, | |
| "rewards/rollout_reward_func/mean": 3.0518195629119873, | |
| "rewards/rollout_reward_func/std": 2.1500182151794434, | |
| "sampling/importance_sampling_ratio/max": 0.3023638129234314, | |
| "sampling/importance_sampling_ratio/mean": 0.15813782811164856, | |
| "sampling/importance_sampling_ratio/min": 0.0001912050211103633, | |
| "sampling/sampling_logp_difference/max": 2.7088003158569336, | |
| "sampling/sampling_logp_difference/mean": 0.8632931709289551, | |
| "step": 107, | |
| "step_time": 9.265566470999602 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.301291227340698, | |
| "epoch": 0.00108, | |
| "grad_norm": 0.0192717295140028, | |
| "kl": 0.40367304906249046, | |
| "learning_rate": 9.999999400316609e-06, | |
| "loss": -0.0233, | |
| "step": 108, | |
| "step_time": 4.872972249000213 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 785.0, | |
| "completions/max_terminated_length": 785.0, | |
| "completions/mean_length": 453.8125, | |
| "completions/mean_terminated_length": 454.06451416015625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.344372421503067, | |
| "epoch": 0.00109, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.005915234796702862, | |
| "kl": 0.5280559062957764, | |
| "learning_rate": 9.999999383543059e-06, | |
| "loss": -0.0117, | |
| "num_tokens": 2832092.0, | |
| "reward": 2.0643255710601807, | |
| "reward_std": 0.36830055713653564, | |
| "rewards/rollout_reward_func/mean": 2.0643255710601807, | |
| "rewards/rollout_reward_func/std": 1.2795445919036865, | |
| "sampling/importance_sampling_ratio/max": 0.5460331439971924, | |
| "sampling/importance_sampling_ratio/mean": 0.2571074366569519, | |
| "sampling/importance_sampling_ratio/min": 3.3921960648563643e-15, | |
| "sampling/sampling_logp_difference/max": 4.154726505279541, | |
| "sampling/sampling_logp_difference/mean": 0.7197229862213135, | |
| "step": 109, | |
| "step_time": 9.136753535999105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.370859324932098, | |
| "epoch": 0.0011, | |
| "grad_norm": 0.006120497360825539, | |
| "kl": 0.5247279852628708, | |
| "learning_rate": 9.999999366538148e-06, | |
| "loss": -0.0117, | |
| "step": 110, | |
| "step_time": 4.750533235999228 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 704.0, | |
| "completions/max_terminated_length": 704.0, | |
| "completions/mean_length": 468.09375, | |
| "completions/mean_terminated_length": 468.09375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.781158804893494, | |
| "epoch": 0.00111, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009389814920723438, | |
| "kl": 0.40754758939146996, | |
| "learning_rate": 9.999999349301878e-06, | |
| "loss": -0.0128, | |
| "num_tokens": 2887456.0, | |
| "reward": 2.372213840484619, | |
| "reward_std": 1.1804518699645996, | |
| "rewards/rollout_reward_func/mean": 2.372213840484619, | |
| "rewards/rollout_reward_func/std": 1.6503901481628418, | |
| "sampling/importance_sampling_ratio/max": 0.5524845123291016, | |
| "sampling/importance_sampling_ratio/mean": 0.1906067430973053, | |
| "sampling/importance_sampling_ratio/min": 4.788781764827768e-16, | |
| "sampling/sampling_logp_difference/max": 3.3403866291046143, | |
| "sampling/sampling_logp_difference/mean": 1.0884251594543457, | |
| "step": 111, | |
| "step_time": 8.606538135000847 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.815384209156036, | |
| "epoch": 0.00112, | |
| "grad_norm": 0.01099073514342308, | |
| "kl": 0.40177351236343384, | |
| "learning_rate": 9.999999331834249e-06, | |
| "loss": -0.0128, | |
| "step": 112, | |
| "step_time": 5.018896831999882 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 722.0, | |
| "completions/max_terminated_length": 722.0, | |
| "completions/mean_length": 444.90625, | |
| "completions/mean_terminated_length": 444.90625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.825027763843536, | |
| "epoch": 0.00113, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011304070241749287, | |
| "kl": 0.35968483053147793, | |
| "learning_rate": 9.99999931413526e-06, | |
| "loss": -0.0059, | |
| "num_tokens": 2943025.0, | |
| "reward": 2.8332765102386475, | |
| "reward_std": 1.7419302463531494, | |
| "rewards/rollout_reward_func/mean": 2.8332765102386475, | |
| "rewards/rollout_reward_func/std": 1.946202278137207, | |
| "sampling/importance_sampling_ratio/max": 0.30486950278282166, | |
| "sampling/importance_sampling_ratio/mean": 0.14108777046203613, | |
| "sampling/importance_sampling_ratio/min": 0.00019812805112451315, | |
| "sampling/sampling_logp_difference/max": 2.6881039142608643, | |
| "sampling/sampling_logp_difference/mean": 1.0696028470993042, | |
| "step": 113, | |
| "step_time": 8.58645070500097 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.8290793895721436, | |
| "epoch": 0.00114, | |
| "grad_norm": 0.011425897479057312, | |
| "kl": 0.3614608943462372, | |
| "learning_rate": 9.999999296204912e-06, | |
| "loss": -0.0059, | |
| "step": 114, | |
| "step_time": 5.139510378000523 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 767.0, | |
| "completions/max_terminated_length": 767.0, | |
| "completions/mean_length": 235.75, | |
| "completions/mean_terminated_length": 235.75, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.922475636005402, | |
| "epoch": 0.00115, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.023028887808322906, | |
| "kl": 0.4135777149349451, | |
| "learning_rate": 9.999999278043205e-06, | |
| "loss": -0.0119, | |
| "num_tokens": 2988835.0, | |
| "reward": 2.040597438812256, | |
| "reward_std": 0.6493960618972778, | |
| "rewards/rollout_reward_func/mean": 2.040597438812256, | |
| "rewards/rollout_reward_func/std": 1.5975624322891235, | |
| "sampling/importance_sampling_ratio/max": 0.5552006363868713, | |
| "sampling/importance_sampling_ratio/mean": 0.2123653143644333, | |
| "sampling/importance_sampling_ratio/min": 1.9730843694998335e-11, | |
| "sampling/sampling_logp_difference/max": 3.120391845703125, | |
| "sampling/sampling_logp_difference/mean": 1.0702314376831055, | |
| "step": 115, | |
| "step_time": 8.227128244000596 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.934574216604233, | |
| "epoch": 0.00116, | |
| "grad_norm": 0.024487733840942383, | |
| "kl": 0.4113778416067362, | |
| "learning_rate": 9.99999925965014e-06, | |
| "loss": -0.0118, | |
| "step": 116, | |
| "step_time": 4.603303872000197 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0031250000465661287, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250000465661287, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 446.0, | |
| "completions/mean_terminated_length": 459.8709411621094, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.283336162567139, | |
| "epoch": 0.00117, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.06837444752454758, | |
| "kl": 0.7454761080443859, | |
| "learning_rate": 9.999999241025713e-06, | |
| "loss": -0.0137, | |
| "num_tokens": 3044815.0, | |
| "reward": 1.6962122917175293, | |
| "reward_std": 0.9731729030609131, | |
| "rewards/rollout_reward_func/mean": 1.6962122917175293, | |
| "rewards/rollout_reward_func/std": 1.472454309463501, | |
| "sampling/importance_sampling_ratio/max": 0.5520793199539185, | |
| "sampling/importance_sampling_ratio/mean": 0.1955501139163971, | |
| "sampling/importance_sampling_ratio/min": 8.385171321124898e-15, | |
| "sampling/sampling_logp_difference/max": 3.5139248371124268, | |
| "sampling/sampling_logp_difference/mean": 1.0980048179626465, | |
| "step": 117, | |
| "step_time": 8.851413534999665 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02083333395421505, | |
| "clip_ratio/high_mean": 0.010416666977107525, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "entropy": 5.293146431446075, | |
| "epoch": 0.00118, | |
| "grad_norm": 0.025546826422214508, | |
| "kl": 0.6223187446594238, | |
| "learning_rate": 9.99999922216993e-06, | |
| "loss": -0.014, | |
| "step": 118, | |
| "step_time": 5.2780849090008815 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 823.0, | |
| "completions/max_terminated_length": 823.0, | |
| "completions/mean_length": 477.75, | |
| "completions/mean_terminated_length": 498.20001220703125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.602368354797363, | |
| "epoch": 0.00119, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01806759461760521, | |
| "kl": 0.340764039196074, | |
| "learning_rate": 9.999999203082784e-06, | |
| "loss": -0.0204, | |
| "num_tokens": 3099071.0, | |
| "reward": 1.5769758224487305, | |
| "reward_std": 1.3088091611862183, | |
| "rewards/rollout_reward_func/mean": 1.5769758224487305, | |
| "rewards/rollout_reward_func/std": 1.6474065780639648, | |
| "sampling/importance_sampling_ratio/max": 0.5519512295722961, | |
| "sampling/importance_sampling_ratio/mean": 0.15655829012393951, | |
| "sampling/importance_sampling_ratio/min": 3.2326309328439702e-18, | |
| "sampling/sampling_logp_difference/max": 3.693403959274292, | |
| "sampling/sampling_logp_difference/mean": 1.4753286838531494, | |
| "step": 119, | |
| "step_time": 9.247204750000492 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.583022654056549, | |
| "epoch": 0.0012, | |
| "grad_norm": 0.016562430188059807, | |
| "kl": 0.33369210083037615, | |
| "learning_rate": 9.999999183764282e-06, | |
| "loss": -0.0205, | |
| "step": 120, | |
| "step_time": 5.400903367999945 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 473.59375, | |
| "completions/mean_terminated_length": 473.59375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.3363790810108185, | |
| "epoch": 0.00121, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005732807330787182, | |
| "kl": 0.3513603312894702, | |
| "learning_rate": 9.999999164214418e-06, | |
| "loss": -0.015, | |
| "num_tokens": 3154721.0, | |
| "reward": 2.565702199935913, | |
| "reward_std": 0.9166498184204102, | |
| "rewards/rollout_reward_func/mean": 2.565702199935913, | |
| "rewards/rollout_reward_func/std": 1.641287922859192, | |
| "sampling/importance_sampling_ratio/max": 0.5513830184936523, | |
| "sampling/importance_sampling_ratio/mean": 0.21081653237342834, | |
| "sampling/importance_sampling_ratio/min": 9.868382777611373e-10, | |
| "sampling/sampling_logp_difference/max": 4.323619365692139, | |
| "sampling/sampling_logp_difference/mean": 0.9557435512542725, | |
| "step": 121, | |
| "step_time": 8.598964995000188 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.306765913963318, | |
| "epoch": 0.00122, | |
| "grad_norm": 0.005753090605139732, | |
| "kl": 0.3516113171353936, | |
| "learning_rate": 9.999999144433197e-06, | |
| "loss": -0.015, | |
| "step": 122, | |
| "step_time": 4.775643616999787 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 866.0, | |
| "completions/max_terminated_length": 866.0, | |
| "completions/mean_length": 574.21875, | |
| "completions/mean_terminated_length": 574.21875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.031126022338867, | |
| "epoch": 0.00123, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014242246747016907, | |
| "kl": 0.27723702508956194, | |
| "learning_rate": 9.999999124420615e-06, | |
| "loss": -0.0165, | |
| "num_tokens": 3214228.0, | |
| "reward": 2.553722381591797, | |
| "reward_std": 1.49879789352417, | |
| "rewards/rollout_reward_func/mean": 2.553722381591797, | |
| "rewards/rollout_reward_func/std": 1.8885005712509155, | |
| "sampling/importance_sampling_ratio/max": 0.30776602029800415, | |
| "sampling/importance_sampling_ratio/mean": 0.13252753019332886, | |
| "sampling/importance_sampling_ratio/min": 1.6167473718305125e-13, | |
| "sampling/sampling_logp_difference/max": 3.7081410884857178, | |
| "sampling/sampling_logp_difference/mean": 1.0756418704986572, | |
| "step": 123, | |
| "step_time": 8.99680862499963 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 6.018238723278046, | |
| "epoch": 0.00124, | |
| "grad_norm": 0.008585439994931221, | |
| "kl": 0.27489931136369705, | |
| "learning_rate": 9.999999104176675e-06, | |
| "loss": -0.0165, | |
| "step": 124, | |
| "step_time": 5.400279564999892 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 787.0, | |
| "completions/max_terminated_length": 787.0, | |
| "completions/mean_length": 428.40625, | |
| "completions/mean_terminated_length": 441.70965576171875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.530455976724625, | |
| "epoch": 0.00125, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01981445588171482, | |
| "kl": 0.5246058944612741, | |
| "learning_rate": 9.999999083701375e-06, | |
| "loss": -0.0115, | |
| "num_tokens": 3268782.0, | |
| "reward": 2.442720651626587, | |
| "reward_std": 0.9873529672622681, | |
| "rewards/rollout_reward_func/mean": 2.442720651626587, | |
| "rewards/rollout_reward_func/std": 1.5368415117263794, | |
| "sampling/importance_sampling_ratio/max": 0.5591750144958496, | |
| "sampling/importance_sampling_ratio/mean": 0.1981470286846161, | |
| "sampling/importance_sampling_ratio/min": 6.880031043303685e-15, | |
| "sampling/sampling_logp_difference/max": 2.8299753665924072, | |
| "sampling/sampling_logp_difference/mean": 1.1136442422866821, | |
| "step": 125, | |
| "step_time": 8.792223250000006 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.010416666977107525, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "entropy": 5.526207149028778, | |
| "epoch": 0.00126, | |
| "grad_norm": 0.07942020893096924, | |
| "kl": 0.5783120766282082, | |
| "learning_rate": 9.999999062994716e-06, | |
| "loss": -0.0115, | |
| "step": 126, | |
| "step_time": 5.212345460999586 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011363636702299118, | |
| "clip_ratio/high_mean": 0.005681818351149559, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005681818351149559, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 751.0, | |
| "completions/max_terminated_length": 751.0, | |
| "completions/mean_length": 355.09375, | |
| "completions/mean_terminated_length": 355.09375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.586715936660767, | |
| "epoch": 0.00127, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01894986629486084, | |
| "kl": 0.36187045089900494, | |
| "learning_rate": 9.999999042056698e-06, | |
| "loss": -0.0093, | |
| "num_tokens": 3320961.0, | |
| "reward": 3.5986928939819336, | |
| "reward_std": 1.4135947227478027, | |
| "rewards/rollout_reward_func/mean": 3.5986928939819336, | |
| "rewards/rollout_reward_func/std": 1.7381529808044434, | |
| "sampling/importance_sampling_ratio/max": 0.5567580461502075, | |
| "sampling/importance_sampling_ratio/mean": 0.18505819141864777, | |
| "sampling/importance_sampling_ratio/min": 3.734913612349951e-18, | |
| "sampling/sampling_logp_difference/max": 3.9289333820343018, | |
| "sampling/sampling_logp_difference/mean": 1.1177394390106201, | |
| "step": 127, | |
| "step_time": 8.4797745940009 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.003289473708719015, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003289473708719015, | |
| "entropy": 5.59991979598999, | |
| "epoch": 0.00128, | |
| "grad_norm": 0.015152394771575928, | |
| "kl": 0.36163298040628433, | |
| "learning_rate": 9.99999902088732e-06, | |
| "loss": -0.0093, | |
| "step": 128, | |
| "step_time": 4.677577405001102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 812.0, | |
| "completions/max_terminated_length": 812.0, | |
| "completions/mean_length": 428.5, | |
| "completions/mean_terminated_length": 428.5, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.128687739372253, | |
| "epoch": 0.00129, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005386632867157459, | |
| "kl": 0.29446713998913765, | |
| "learning_rate": 9.999998999486583e-06, | |
| "loss": -0.0113, | |
| "num_tokens": 3373920.0, | |
| "reward": 3.073700428009033, | |
| "reward_std": 1.0783164501190186, | |
| "rewards/rollout_reward_func/mean": 3.073700428009033, | |
| "rewards/rollout_reward_func/std": 1.8588871955871582, | |
| "sampling/importance_sampling_ratio/max": 0.5572874546051025, | |
| "sampling/importance_sampling_ratio/mean": 0.1692725121974945, | |
| "sampling/importance_sampling_ratio/min": 0.00015941473247949034, | |
| "sampling/sampling_logp_difference/max": 2.785651922225952, | |
| "sampling/sampling_logp_difference/mean": 1.0858612060546875, | |
| "step": 129, | |
| "step_time": 8.688911320999068 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.14833676815033, | |
| "epoch": 0.0013, | |
| "grad_norm": 0.005992444232106209, | |
| "kl": 0.2919249450787902, | |
| "learning_rate": 9.999998977854486e-06, | |
| "loss": -0.0113, | |
| "step": 130, | |
| "step_time": 5.196040378998532 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 803.0, | |
| "completions/max_terminated_length": 803.0, | |
| "completions/mean_length": 425.46875, | |
| "completions/mean_terminated_length": 425.46875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.23365318775177, | |
| "epoch": 0.00131, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.019059518352150917, | |
| "kl": 0.34147817455232143, | |
| "learning_rate": 9.99999895599103e-06, | |
| "loss": -0.0035, | |
| "num_tokens": 3427401.0, | |
| "reward": 2.473796844482422, | |
| "reward_std": 1.8125438690185547, | |
| "rewards/rollout_reward_func/mean": 2.473796844482422, | |
| "rewards/rollout_reward_func/std": 1.8101236820220947, | |
| "sampling/importance_sampling_ratio/max": 0.5523518323898315, | |
| "sampling/importance_sampling_ratio/mean": 0.14591509103775024, | |
| "sampling/importance_sampling_ratio/min": 0.00020133046200498939, | |
| "sampling/sampling_logp_difference/max": 3.7536349296569824, | |
| "sampling/sampling_logp_difference/mean": 1.2340688705444336, | |
| "step": 131, | |
| "step_time": 8.596268926000448 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.220755457878113, | |
| "epoch": 0.00132, | |
| "grad_norm": 0.018474310636520386, | |
| "kl": 0.3436393868178129, | |
| "learning_rate": 9.999998933896215e-06, | |
| "loss": -0.0035, | |
| "step": 132, | |
| "step_time": 5.2319554900000185 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 821.0, | |
| "completions/max_terminated_length": 821.0, | |
| "completions/mean_length": 448.71875, | |
| "completions/mean_terminated_length": 448.71875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.855165183544159, | |
| "epoch": 0.00133, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.021189430728554726, | |
| "kl": 0.4099526349455118, | |
| "learning_rate": 9.999998911570041e-06, | |
| "loss": -0.0112, | |
| "num_tokens": 3482090.0, | |
| "reward": 1.9857467412948608, | |
| "reward_std": 0.9780662655830383, | |
| "rewards/rollout_reward_func/mean": 1.9857467412948608, | |
| "rewards/rollout_reward_func/std": 1.4453747272491455, | |
| "sampling/importance_sampling_ratio/max": 0.5519795417785645, | |
| "sampling/importance_sampling_ratio/mean": 0.14946779608726501, | |
| "sampling/importance_sampling_ratio/min": 4.809954772476128e-20, | |
| "sampling/sampling_logp_difference/max": 13.282112121582031, | |
| "sampling/sampling_logp_difference/mean": 1.1622436046600342, | |
| "step": 133, | |
| "step_time": 8.62497339599986 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 5.8229920864105225, | |
| "epoch": 0.00134, | |
| "grad_norm": 0.02182932198047638, | |
| "kl": 0.4096921207383275, | |
| "learning_rate": 9.999998889012509e-06, | |
| "loss": -0.0112, | |
| "step": 134, | |
| "step_time": 4.7461931110005935 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 731.0, | |
| "completions/max_terminated_length": 731.0, | |
| "completions/mean_length": 405.90625, | |
| "completions/mean_terminated_length": 405.90625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.744675666093826, | |
| "epoch": 0.00135, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.020258011296391487, | |
| "kl": 0.5242082104086876, | |
| "learning_rate": 9.999998866223617e-06, | |
| "loss": 0.0004, | |
| "num_tokens": 3535997.0, | |
| "reward": 2.09535551071167, | |
| "reward_std": 0.801947832107544, | |
| "rewards/rollout_reward_func/mean": 2.09535551071167, | |
| "rewards/rollout_reward_func/std": 1.5733643770217896, | |
| "sampling/importance_sampling_ratio/max": 0.5550893545150757, | |
| "sampling/importance_sampling_ratio/mean": 0.21865856647491455, | |
| "sampling/importance_sampling_ratio/min": 0.0004308926872909069, | |
| "sampling/sampling_logp_difference/max": 2.2621819972991943, | |
| "sampling/sampling_logp_difference/mean": 0.6881621479988098, | |
| "step": 135, | |
| "step_time": 8.649714739999581 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0078125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 4.7144655585289, | |
| "epoch": 0.00136, | |
| "grad_norm": 0.017525319010019302, | |
| "kl": 0.5250850170850754, | |
| "learning_rate": 9.999998843203364e-06, | |
| "loss": 0.0004, | |
| "step": 136, | |
| "step_time": 5.175107476999074 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 778.0, | |
| "completions/max_terminated_length": 778.0, | |
| "completions/mean_length": 373.84375, | |
| "completions/mean_terminated_length": 373.84375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.857703149318695, | |
| "epoch": 0.00137, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.00757873198017478, | |
| "kl": 0.39663390070199966, | |
| "learning_rate": 9.999998819951753e-06, | |
| "loss": -0.0103, | |
| "num_tokens": 3587640.0, | |
| "reward": 3.1773321628570557, | |
| "reward_std": 1.37308669090271, | |
| "rewards/rollout_reward_func/mean": 3.1773321628570557, | |
| "rewards/rollout_reward_func/std": 1.8172297477722168, | |
| "sampling/importance_sampling_ratio/max": 0.5551076531410217, | |
| "sampling/importance_sampling_ratio/mean": 0.21167081594467163, | |
| "sampling/importance_sampling_ratio/min": 0.00045508454786613584, | |
| "sampling/sampling_logp_difference/max": 3.646714210510254, | |
| "sampling/sampling_logp_difference/mean": 0.8012743592262268, | |
| "step": 137, | |
| "step_time": 8.443151505000515 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.867439925670624, | |
| "epoch": 0.00138, | |
| "grad_norm": 0.007302064914256334, | |
| "kl": 0.39654075540602207, | |
| "learning_rate": 9.999998796468782e-06, | |
| "loss": -0.0102, | |
| "step": 138, | |
| "step_time": 5.160483013000885 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 496.53125, | |
| "completions/mean_terminated_length": 512.0322265625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.3140488266944885, | |
| "epoch": 0.00139, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.047983165830373764, | |
| "kl": 0.43244970217347145, | |
| "learning_rate": 9.999998772754452e-06, | |
| "loss": -0.0136, | |
| "num_tokens": 3642802.0, | |
| "reward": 2.342683792114258, | |
| "reward_std": 0.9078880548477173, | |
| "rewards/rollout_reward_func/mean": 2.342683792114258, | |
| "rewards/rollout_reward_func/std": 1.424059510231018, | |
| "sampling/importance_sampling_ratio/max": 0.5550379753112793, | |
| "sampling/importance_sampling_ratio/mean": 0.21025414764881134, | |
| "sampling/importance_sampling_ratio/min": 4.1330454277109865e-14, | |
| "sampling/sampling_logp_difference/max": 4.175216197967529, | |
| "sampling/sampling_logp_difference/mean": 0.9738059639930725, | |
| "step": 139, | |
| "step_time": 8.921468903000005 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.2954131960868835, | |
| "epoch": 0.0014, | |
| "grad_norm": 0.03462895750999451, | |
| "kl": 0.43446778878569603, | |
| "learning_rate": 9.999998748808764e-06, | |
| "loss": -0.0138, | |
| "step": 140, | |
| "step_time": 4.874726669999291 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 767.0, | |
| "completions/max_terminated_length": 767.0, | |
| "completions/mean_length": 364.84375, | |
| "completions/mean_terminated_length": 369.3333435058594, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.6628918051719666, | |
| "epoch": 0.00141, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.048540182411670685, | |
| "kl": 0.37702805921435356, | |
| "learning_rate": 9.999998724631715e-06, | |
| "loss": -0.0215, | |
| "num_tokens": 3694811.0, | |
| "reward": 2.8997886180877686, | |
| "reward_std": 1.3242307901382446, | |
| "rewards/rollout_reward_func/mean": 2.8997886180877686, | |
| "rewards/rollout_reward_func/std": 1.9665751457214355, | |
| "sampling/importance_sampling_ratio/max": 0.30592989921569824, | |
| "sampling/importance_sampling_ratio/mean": 0.13573986291885376, | |
| "sampling/importance_sampling_ratio/min": 5.063214828144886e-15, | |
| "sampling/sampling_logp_difference/max": 4.425807476043701, | |
| "sampling/sampling_logp_difference/mean": 1.0662561655044556, | |
| "step": 141, | |
| "step_time": 8.895234876999439 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.723241686820984, | |
| "epoch": 0.00142, | |
| "grad_norm": 0.044111333787441254, | |
| "kl": 0.3669211324304342, | |
| "learning_rate": 9.999998700223308e-06, | |
| "loss": -0.0217, | |
| "step": 142, | |
| "step_time": 4.655782125000314 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 866.0, | |
| "completions/max_terminated_length": 866.0, | |
| "completions/mean_length": 388.5625, | |
| "completions/mean_terminated_length": 388.5625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.8627594113349915, | |
| "epoch": 0.00143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01214392390102148, | |
| "kl": 0.37197905220091343, | |
| "learning_rate": 9.999998675583542e-06, | |
| "loss": -0.0186, | |
| "num_tokens": 3748064.0, | |
| "reward": 2.9884138107299805, | |
| "reward_std": 1.394155740737915, | |
| "rewards/rollout_reward_func/mean": 2.9884138107299805, | |
| "rewards/rollout_reward_func/std": 1.8353135585784912, | |
| "sampling/importance_sampling_ratio/max": 0.5596773028373718, | |
| "sampling/importance_sampling_ratio/mean": 0.16319838166236877, | |
| "sampling/importance_sampling_ratio/min": 3.1172959769065756e-09, | |
| "sampling/sampling_logp_difference/max": 4.036569595336914, | |
| "sampling/sampling_logp_difference/mean": 1.0934593677520752, | |
| "step": 143, | |
| "step_time": 9.0844559330003 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.029166667722165585, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.029166667722165585, | |
| "entropy": 5.961900234222412, | |
| "epoch": 0.00144, | |
| "grad_norm": 0.0157622080296278, | |
| "kl": 0.3777771629393101, | |
| "learning_rate": 9.999998650712415e-06, | |
| "loss": -0.0185, | |
| "step": 144, | |
| "step_time": 5.404956216000301 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0028409091755747795, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 866.0, | |
| "completions/max_terminated_length": 866.0, | |
| "completions/mean_length": 385.5625, | |
| "completions/mean_terminated_length": 389.3000183105469, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.166502773761749, | |
| "epoch": 0.00145, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027631990611553192, | |
| "kl": 0.5853241086006165, | |
| "learning_rate": 9.999998625609931e-06, | |
| "loss": -0.0256, | |
| "num_tokens": 3801052.0, | |
| "reward": 2.257948398590088, | |
| "reward_std": 1.2924938201904297, | |
| "rewards/rollout_reward_func/mean": 2.257948398590088, | |
| "rewards/rollout_reward_func/std": 1.759012222290039, | |
| "sampling/importance_sampling_ratio/max": 0.5587986707687378, | |
| "sampling/importance_sampling_ratio/mean": 0.2040739357471466, | |
| "sampling/importance_sampling_ratio/min": 1.3630546344400862e-12, | |
| "sampling/sampling_logp_difference/max": 3.9767651557922363, | |
| "sampling/sampling_logp_difference/mean": 0.9805353283882141, | |
| "step": 145, | |
| "step_time": 9.069786099000794 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0028409091755747795, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "entropy": 5.183136820793152, | |
| "epoch": 0.00146, | |
| "grad_norm": 0.026243364438414574, | |
| "kl": 0.5845871120691299, | |
| "learning_rate": 9.999998600276087e-06, | |
| "loss": -0.0256, | |
| "step": 146, | |
| "step_time": 4.969717237999248 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.015625, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 812.0, | |
| "completions/max_terminated_length": 812.0, | |
| "completions/mean_length": 382.4375, | |
| "completions/mean_terminated_length": 382.4375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.606759130954742, | |
| "epoch": 0.00147, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.06667429953813553, | |
| "kl": 0.5389902517199516, | |
| "learning_rate": 9.999998574710883e-06, | |
| "loss": -0.0173, | |
| "num_tokens": 3853406.0, | |
| "reward": 1.033826231956482, | |
| "reward_std": 0.8608077764511108, | |
| "rewards/rollout_reward_func/mean": 1.033826231956482, | |
| "rewards/rollout_reward_func/std": 1.2542723417282104, | |
| "sampling/importance_sampling_ratio/max": 0.3122768998146057, | |
| "sampling/importance_sampling_ratio/mean": 0.10800905525684357, | |
| "sampling/importance_sampling_ratio/min": 1.0712759864892608e-17, | |
| "sampling/sampling_logp_difference/max": 4.165693283081055, | |
| "sampling/sampling_logp_difference/mean": 1.3520691394805908, | |
| "step": 147, | |
| "step_time": 8.949542632000885 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08159722294658422, | |
| "clip_ratio/high_mean": 0.04079861147329211, | |
| "clip_ratio/low_mean": 0.02434501238167286, | |
| "clip_ratio/low_min": 0.013888888992369175, | |
| "clip_ratio/region_mean": 0.06514362338930368, | |
| "entropy": 6.569398641586304, | |
| "epoch": 0.00148, | |
| "grad_norm": 0.014427587389945984, | |
| "kl": 0.3714602068066597, | |
| "learning_rate": 9.999998548914318e-06, | |
| "loss": -0.0175, | |
| "step": 148, | |
| "step_time": 4.723479898000278 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 832.0, | |
| "completions/max_terminated_length": 832.0, | |
| "completions/mean_length": 414.875, | |
| "completions/mean_terminated_length": 414.875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.415063202381134, | |
| "epoch": 0.00149, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006415513344109058, | |
| "kl": 0.3409431576728821, | |
| "learning_rate": 9.999998522886397e-06, | |
| "loss": -0.0151, | |
| "num_tokens": 3907390.0, | |
| "reward": 2.522676467895508, | |
| "reward_std": 1.2102174758911133, | |
| "rewards/rollout_reward_func/mean": 2.522676467895508, | |
| "rewards/rollout_reward_func/std": 1.6199886798858643, | |
| "sampling/importance_sampling_ratio/max": 0.5543047189712524, | |
| "sampling/importance_sampling_ratio/mean": 0.20426242053508759, | |
| "sampling/importance_sampling_ratio/min": 0.0003250113222748041, | |
| "sampling/sampling_logp_difference/max": 2.426882266998291, | |
| "sampling/sampling_logp_difference/mean": 0.9172008037567139, | |
| "step": 149, | |
| "step_time": 9.147840422000172 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.1197916679084301, | |
| "clip_ratio/high_mean": 0.05989583395421505, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05989583395421505, | |
| "entropy": 5.369048357009888, | |
| "epoch": 0.0015, | |
| "grad_norm": 0.009035247378051281, | |
| "kl": 0.3433597218245268, | |
| "learning_rate": 9.999998496627115e-06, | |
| "loss": -0.0151, | |
| "step": 150, | |
| "step_time": 4.844408977000057 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 767.0, | |
| "completions/max_terminated_length": 767.0, | |
| "completions/mean_length": 462.09375, | |
| "completions/mean_terminated_length": 462.09375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.271999478340149, | |
| "epoch": 0.00151, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009318442083895206, | |
| "kl": 0.2838898357003927, | |
| "learning_rate": 9.999998470136475e-06, | |
| "loss": -0.0107, | |
| "num_tokens": 3963989.0, | |
| "reward": 2.4085025787353516, | |
| "reward_std": 1.5739270448684692, | |
| "rewards/rollout_reward_func/mean": 2.4085025787353516, | |
| "rewards/rollout_reward_func/std": 2.0519044399261475, | |
| "sampling/importance_sampling_ratio/max": 0.3041028082370758, | |
| "sampling/importance_sampling_ratio/mean": 0.10673074424266815, | |
| "sampling/importance_sampling_ratio/min": 3.432775631405483e-20, | |
| "sampling/sampling_logp_difference/max": 3.6299948692321777, | |
| "sampling/sampling_logp_difference/mean": 1.236303448677063, | |
| "step": 151, | |
| "step_time": 8.581440807000035 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.249309480190277, | |
| "epoch": 0.00152, | |
| "grad_norm": 0.009570055641233921, | |
| "kl": 0.2857303377240896, | |
| "learning_rate": 9.999998443414474e-06, | |
| "loss": -0.0106, | |
| "step": 152, | |
| "step_time": 4.792721901000277 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 661.0, | |
| "completions/max_terminated_length": 661.0, | |
| "completions/mean_length": 274.21875, | |
| "completions/mean_terminated_length": 274.21875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.317317992448807, | |
| "epoch": 0.00153, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.011901522986590862, | |
| "kl": 0.47474728897213936, | |
| "learning_rate": 9.999998416461115e-06, | |
| "loss": -0.0159, | |
| "num_tokens": 4010768.0, | |
| "reward": 3.833049774169922, | |
| "reward_std": 1.0186306238174438, | |
| "rewards/rollout_reward_func/mean": 3.833049774169922, | |
| "rewards/rollout_reward_func/std": 1.4386236667633057, | |
| "sampling/importance_sampling_ratio/max": 0.5543802380561829, | |
| "sampling/importance_sampling_ratio/mean": 0.2531720995903015, | |
| "sampling/importance_sampling_ratio/min": 1.1614738034196326e-13, | |
| "sampling/sampling_logp_difference/max": 4.201288223266602, | |
| "sampling/sampling_logp_difference/mean": 1.0024373531341553, | |
| "step": 153, | |
| "step_time": 8.65931397399936 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.322837769985199, | |
| "epoch": 0.00154, | |
| "grad_norm": 0.011569921858608723, | |
| "kl": 0.4757602885365486, | |
| "learning_rate": 9.999998389276397e-06, | |
| "loss": -0.016, | |
| "step": 154, | |
| "step_time": 4.628009893999661 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 866.0, | |
| "completions/max_terminated_length": 866.0, | |
| "completions/mean_length": 608.03125, | |
| "completions/mean_terminated_length": 608.03125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.3297582268714905, | |
| "epoch": 0.00155, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.012206662446260452, | |
| "kl": 0.3360460437834263, | |
| "learning_rate": 9.999998361860319e-06, | |
| "loss": -0.0247, | |
| "num_tokens": 4070767.0, | |
| "reward": 1.6205885410308838, | |
| "reward_std": 0.9136905670166016, | |
| "rewards/rollout_reward_func/mean": 1.6205885410308838, | |
| "rewards/rollout_reward_func/std": 1.4889774322509766, | |
| "sampling/importance_sampling_ratio/max": 0.307391494512558, | |
| "sampling/importance_sampling_ratio/mean": 0.12577039003372192, | |
| "sampling/importance_sampling_ratio/min": 3.3397945867208456e-12, | |
| "sampling/sampling_logp_difference/max": 3.970939874649048, | |
| "sampling/sampling_logp_difference/mean": 1.1507443189620972, | |
| "step": 155, | |
| "step_time": 9.421827424999265 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.354883968830109, | |
| "epoch": 0.00156, | |
| "grad_norm": 0.011644980870187283, | |
| "kl": 0.3347685132175684, | |
| "learning_rate": 9.99999833421288e-06, | |
| "loss": -0.0247, | |
| "step": 156, | |
| "step_time": 4.989567845999318 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0034722222480922937, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034722222480922937, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 848.0, | |
| "completions/max_terminated_length": 848.0, | |
| "completions/mean_length": 306.9375, | |
| "completions/mean_terminated_length": 316.32257080078125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.295707732439041, | |
| "epoch": 0.00157, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.006841656286269426, | |
| "kl": 0.45711028575897217, | |
| "learning_rate": 9.999998306334084e-06, | |
| "loss": 0.0064, | |
| "num_tokens": 4119100.0, | |
| "reward": 3.2127416133880615, | |
| "reward_std": 0.8213455677032471, | |
| "rewards/rollout_reward_func/mean": 3.2127416133880615, | |
| "rewards/rollout_reward_func/std": 1.3526753187179565, | |
| "sampling/importance_sampling_ratio/max": 0.5526928901672363, | |
| "sampling/importance_sampling_ratio/mean": 0.20098578929901123, | |
| "sampling/importance_sampling_ratio/min": 6.629302141958338e-13, | |
| "sampling/sampling_logp_difference/max": 3.455430030822754, | |
| "sampling/sampling_logp_difference/mean": 1.1175944805145264, | |
| "step": 157, | |
| "step_time": 8.230952549999529 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.352945655584335, | |
| "epoch": 0.00158, | |
| "grad_norm": 0.0065981000661849976, | |
| "kl": 0.4474712498486042, | |
| "learning_rate": 9.99999827822393e-06, | |
| "loss": 0.0064, | |
| "step": 158, | |
| "step_time": 5.171249369000179 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 731.0, | |
| "completions/max_terminated_length": 731.0, | |
| "completions/mean_length": 282.75, | |
| "completions/mean_terminated_length": 282.75, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.617429673671722, | |
| "epoch": 0.00159, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.012413682416081429, | |
| "kl": 0.44416868314146996, | |
| "learning_rate": 9.999998249882414e-06, | |
| "loss": -0.0191, | |
| "num_tokens": 4166080.0, | |
| "reward": 2.787449836730957, | |
| "reward_std": 1.4347116947174072, | |
| "rewards/rollout_reward_func/mean": 2.787449836730957, | |
| "rewards/rollout_reward_func/std": 1.7297974824905396, | |
| "sampling/importance_sampling_ratio/max": 0.5567501187324524, | |
| "sampling/importance_sampling_ratio/mean": 0.2343512773513794, | |
| "sampling/importance_sampling_ratio/min": 3.580179833845965e-16, | |
| "sampling/sampling_logp_difference/max": 3.5624475479125977, | |
| "sampling/sampling_logp_difference/mean": 1.0345864295959473, | |
| "step": 159, | |
| "step_time": 8.048230264000267 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.655277848243713, | |
| "epoch": 0.0016, | |
| "grad_norm": 0.011771933175623417, | |
| "kl": 0.4446682333946228, | |
| "learning_rate": 9.999998221309542e-06, | |
| "loss": -0.0191, | |
| "step": 160, | |
| "step_time": 4.566467165000631 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 649.0, | |
| "completions/max_terminated_length": 649.0, | |
| "completions/mean_length": 233.40625, | |
| "completions/mean_terminated_length": 233.40625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.265278160572052, | |
| "epoch": 0.00161, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013585682958364487, | |
| "kl": 0.4800906181335449, | |
| "learning_rate": 9.999998192505309e-06, | |
| "loss": -0.0153, | |
| "num_tokens": 4211781.0, | |
| "reward": 2.0160932540893555, | |
| "reward_std": 1.3408175706863403, | |
| "rewards/rollout_reward_func/mean": 2.0160932540893555, | |
| "rewards/rollout_reward_func/std": 1.7153640985488892, | |
| "sampling/importance_sampling_ratio/max": 0.5554280281066895, | |
| "sampling/importance_sampling_ratio/mean": 0.18753069639205933, | |
| "sampling/importance_sampling_ratio/min": 8.604307595305727e-07, | |
| "sampling/sampling_logp_difference/max": 4.6766486167907715, | |
| "sampling/sampling_logp_difference/mean": 1.2480790615081787, | |
| "step": 161, | |
| "step_time": 8.229338431000087 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.281177341938019, | |
| "epoch": 0.00162, | |
| "grad_norm": 0.012755308300256729, | |
| "kl": 0.4964945949614048, | |
| "learning_rate": 9.999998163469716e-06, | |
| "loss": -0.0154, | |
| "step": 162, | |
| "step_time": 4.38510970399966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 778.0, | |
| "completions/max_terminated_length": 778.0, | |
| "completions/mean_length": 557.375, | |
| "completions/mean_terminated_length": 552.1290283203125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.4221086502075195, | |
| "epoch": 0.00163, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007355245761573315, | |
| "kl": 0.330900888890028, | |
| "learning_rate": 9.999998134202764e-06, | |
| "loss": -0.0155, | |
| "num_tokens": 4269709.0, | |
| "reward": 2.1687724590301514, | |
| "reward_std": 1.417752981185913, | |
| "rewards/rollout_reward_func/mean": 2.1687724590301514, | |
| "rewards/rollout_reward_func/std": 1.9265196323394775, | |
| "sampling/importance_sampling_ratio/max": 0.30897635221481323, | |
| "sampling/importance_sampling_ratio/mean": 0.12372960150241852, | |
| "sampling/importance_sampling_ratio/min": 2.4737660090288395e-17, | |
| "sampling/sampling_logp_difference/max": 3.5442941188812256, | |
| "sampling/sampling_logp_difference/mean": 1.2593567371368408, | |
| "step": 163, | |
| "step_time": 8.674119632999918 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.411408066749573, | |
| "epoch": 0.00164, | |
| "grad_norm": 0.006261439062654972, | |
| "kl": 0.32452805899083614, | |
| "learning_rate": 9.999998104704453e-06, | |
| "loss": -0.0155, | |
| "step": 164, | |
| "step_time": 5.15820460599889 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 805.0, | |
| "completions/max_terminated_length": 805.0, | |
| "completions/mean_length": 324.21875, | |
| "completions/mean_terminated_length": 324.21875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.876501500606537, | |
| "epoch": 0.00165, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.010572957806289196, | |
| "kl": 0.314052717294544, | |
| "learning_rate": 9.999998074974785e-06, | |
| "loss": -0.0065, | |
| "num_tokens": 4317340.0, | |
| "reward": 2.824183702468872, | |
| "reward_std": 1.7697460651397705, | |
| "rewards/rollout_reward_func/mean": 2.824183702468872, | |
| "rewards/rollout_reward_func/std": 1.7482030391693115, | |
| "sampling/importance_sampling_ratio/max": 0.5577415227890015, | |
| "sampling/importance_sampling_ratio/mean": 0.18741005659103394, | |
| "sampling/importance_sampling_ratio/min": 0.0003228384011890739, | |
| "sampling/sampling_logp_difference/max": 2.3370163440704346, | |
| "sampling/sampling_logp_difference/mean": 1.2416086196899414, | |
| "step": 165, | |
| "step_time": 8.260381059999418 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.816007494926453, | |
| "epoch": 0.00166, | |
| "grad_norm": 0.010662592947483063, | |
| "kl": 0.31538827205076814, | |
| "learning_rate": 9.999998045013754e-06, | |
| "loss": -0.0065, | |
| "step": 166, | |
| "step_time": 4.7127935759995125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 803.0, | |
| "completions/max_terminated_length": 803.0, | |
| "completions/mean_length": 546.21875, | |
| "completions/mean_terminated_length": 546.21875, | |
| "completions/min_length": 410.0, | |
| "completions/min_terminated_length": 410.0, | |
| "entropy": 4.806870490312576, | |
| "epoch": 0.00167, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00802002102136612, | |
| "kl": 0.3522371258586645, | |
| "learning_rate": 9.999998014821366e-06, | |
| "loss": -0.0125, | |
| "num_tokens": 4376707.0, | |
| "reward": 4.180271148681641, | |
| "reward_std": 1.340552568435669, | |
| "rewards/rollout_reward_func/mean": 4.180271148681641, | |
| "rewards/rollout_reward_func/std": 1.3307310342788696, | |
| "sampling/importance_sampling_ratio/max": 0.31147125363349915, | |
| "sampling/importance_sampling_ratio/mean": 0.18969742953777313, | |
| "sampling/importance_sampling_ratio/min": 2.4195236403606748e-17, | |
| "sampling/sampling_logp_difference/max": 3.992608070373535, | |
| "sampling/sampling_logp_difference/mean": 0.8682562112808228, | |
| "step": 167, | |
| "step_time": 9.083963717000188 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 4.7781093418598175, | |
| "epoch": 0.00168, | |
| "grad_norm": 0.008071971125900745, | |
| "kl": 0.3539010286331177, | |
| "learning_rate": 9.999997984397618e-06, | |
| "loss": -0.0125, | |
| "step": 168, | |
| "step_time": 4.795684135000101 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 740.0, | |
| "completions/max_terminated_length": 740.0, | |
| "completions/mean_length": 399.5, | |
| "completions/mean_terminated_length": 399.5, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.353841185569763, | |
| "epoch": 0.00169, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.010215525515377522, | |
| "kl": 0.4602816812694073, | |
| "learning_rate": 9.999997953742511e-06, | |
| "loss": -0.0044, | |
| "num_tokens": 4427943.0, | |
| "reward": 2.107025146484375, | |
| "reward_std": 1.3496294021606445, | |
| "rewards/rollout_reward_func/mean": 2.107025146484375, | |
| "rewards/rollout_reward_func/std": 1.8857003450393677, | |
| "sampling/importance_sampling_ratio/max": 0.5592086315155029, | |
| "sampling/importance_sampling_ratio/mean": 0.20410695672035217, | |
| "sampling/importance_sampling_ratio/min": 3.2384990522604795e-11, | |
| "sampling/sampling_logp_difference/max": 3.561155080795288, | |
| "sampling/sampling_logp_difference/mean": 0.9430296421051025, | |
| "step": 169, | |
| "step_time": 8.44894137799929 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02083333395421505, | |
| "clip_ratio/high_mean": 0.010416666977107525, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "entropy": 5.303603112697601, | |
| "epoch": 0.0017, | |
| "grad_norm": 0.010156241245567799, | |
| "kl": 0.4662858620285988, | |
| "learning_rate": 9.999997922856044e-06, | |
| "loss": -0.0044, | |
| "step": 170, | |
| "step_time": 5.110951400999511 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 733.0, | |
| "completions/max_terminated_length": 733.0, | |
| "completions/mean_length": 281.78125, | |
| "completions/mean_terminated_length": 281.78125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.283334970474243, | |
| "epoch": 0.00171, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.014388482086360455, | |
| "kl": 0.5276032239198685, | |
| "learning_rate": 9.999997891738219e-06, | |
| "loss": -0.0148, | |
| "num_tokens": 4475664.0, | |
| "reward": 3.1777913570404053, | |
| "reward_std": 0.9462900757789612, | |
| "rewards/rollout_reward_func/mean": 3.1777913570404053, | |
| "rewards/rollout_reward_func/std": 1.581575870513916, | |
| "sampling/importance_sampling_ratio/max": 0.5628533959388733, | |
| "sampling/importance_sampling_ratio/mean": 0.24305231869220734, | |
| "sampling/importance_sampling_ratio/min": 0.0005128039629198611, | |
| "sampling/sampling_logp_difference/max": 2.273298501968384, | |
| "sampling/sampling_logp_difference/mean": 0.8266535997390747, | |
| "step": 171, | |
| "step_time": 8.091583472000366 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.257599115371704, | |
| "epoch": 0.00172, | |
| "grad_norm": 0.014056864194571972, | |
| "kl": 0.5337305925786495, | |
| "learning_rate": 9.999997860389035e-06, | |
| "loss": -0.0148, | |
| "step": 172, | |
| "step_time": 4.970710163999684 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 814.0, | |
| "completions/max_terminated_length": 814.0, | |
| "completions/mean_length": 548.75, | |
| "completions/mean_terminated_length": 548.75, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.598077058792114, | |
| "epoch": 0.00173, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007702630013227463, | |
| "kl": 0.26329412683844566, | |
| "learning_rate": 9.99999782880849e-06, | |
| "loss": -0.0126, | |
| "num_tokens": 4534968.0, | |
| "reward": 3.599264144897461, | |
| "reward_std": 1.2832717895507812, | |
| "rewards/rollout_reward_func/mean": 3.599264144897461, | |
| "rewards/rollout_reward_func/std": 1.7741010189056396, | |
| "sampling/importance_sampling_ratio/max": 0.3131314814090729, | |
| "sampling/importance_sampling_ratio/mean": 0.15425240993499756, | |
| "sampling/importance_sampling_ratio/min": 0.0001230030320584774, | |
| "sampling/sampling_logp_difference/max": 3.5723962783813477, | |
| "sampling/sampling_logp_difference/mean": 0.9245375394821167, | |
| "step": 173, | |
| "step_time": 8.712299219000215 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.617673218250275, | |
| "epoch": 0.00174, | |
| "grad_norm": 0.007035167887806892, | |
| "kl": 0.2626843862235546, | |
| "learning_rate": 9.999997796996588e-06, | |
| "loss": -0.0126, | |
| "step": 174, | |
| "step_time": 4.817434919999414 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 866.0, | |
| "completions/max_terminated_length": 866.0, | |
| "completions/mean_length": 611.65625, | |
| "completions/mean_terminated_length": 611.65625, | |
| "completions/min_length": 18.0, | |
| "completions/min_terminated_length": 18.0, | |
| "entropy": 5.546254634857178, | |
| "epoch": 0.00175, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007665017154067755, | |
| "kl": 0.2765345424413681, | |
| "learning_rate": 9.999997764953326e-06, | |
| "loss": -0.002, | |
| "num_tokens": 4596111.0, | |
| "reward": 3.0970826148986816, | |
| "reward_std": 1.2895809412002563, | |
| "rewards/rollout_reward_func/mean": 3.0970826148986816, | |
| "rewards/rollout_reward_func/std": 1.63455331325531, | |
| "sampling/importance_sampling_ratio/max": 0.31010711193084717, | |
| "sampling/importance_sampling_ratio/mean": 0.14346721768379211, | |
| "sampling/importance_sampling_ratio/min": 2.920134970419719e-20, | |
| "sampling/sampling_logp_difference/max": 14.014349937438965, | |
| "sampling/sampling_logp_difference/mean": 1.0683830976486206, | |
| "step": 175, | |
| "step_time": 9.429678833000253 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.569147884845734, | |
| "epoch": 0.00176, | |
| "grad_norm": 0.007943429052829742, | |
| "kl": 0.27362857572734356, | |
| "learning_rate": 9.999997732678706e-06, | |
| "loss": -0.002, | |
| "step": 176, | |
| "step_time": 4.9659347020001405 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 595.0, | |
| "completions/max_terminated_length": 595.0, | |
| "completions/mean_length": 435.15625, | |
| "completions/mean_terminated_length": 432.58062744140625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.725856065750122, | |
| "epoch": 0.00177, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009265062399208546, | |
| "kl": 0.3029242120683193, | |
| "learning_rate": 9.999997700172724e-06, | |
| "loss": -0.0202, | |
| "num_tokens": 4651072.0, | |
| "reward": 3.301889181137085, | |
| "reward_std": 1.8498287200927734, | |
| "rewards/rollout_reward_func/mean": 3.301889181137085, | |
| "rewards/rollout_reward_func/std": 2.1379148960113525, | |
| "sampling/importance_sampling_ratio/max": 0.3099573254585266, | |
| "sampling/importance_sampling_ratio/mean": 0.14696773886680603, | |
| "sampling/importance_sampling_ratio/min": 5.326042341532999e-13, | |
| "sampling/sampling_logp_difference/max": 4.241425037384033, | |
| "sampling/sampling_logp_difference/mean": 1.0613305568695068, | |
| "step": 177, | |
| "step_time": 8.000326000999848 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.7441646456718445, | |
| "epoch": 0.00178, | |
| "grad_norm": 0.009017222560942173, | |
| "kl": 0.2993904184550047, | |
| "learning_rate": 9.999997667435383e-06, | |
| "loss": -0.0202, | |
| "step": 178, | |
| "step_time": 4.911030336000749 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 821.0, | |
| "completions/max_terminated_length": 821.0, | |
| "completions/mean_length": 627.59375, | |
| "completions/mean_terminated_length": 622.8709716796875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.733899414539337, | |
| "epoch": 0.00179, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015371856279671192, | |
| "kl": 0.39605566393584013, | |
| "learning_rate": 9.999997634466684e-06, | |
| "loss": -0.0028, | |
| "num_tokens": 4713163.0, | |
| "reward": 2.408566951751709, | |
| "reward_std": 1.2345832586288452, | |
| "rewards/rollout_reward_func/mean": 2.408566951751709, | |
| "rewards/rollout_reward_func/std": 1.5583417415618896, | |
| "sampling/importance_sampling_ratio/max": 0.3045631945133209, | |
| "sampling/importance_sampling_ratio/mean": 0.14088614284992218, | |
| "sampling/importance_sampling_ratio/min": 3.260920675229406e-17, | |
| "sampling/sampling_logp_difference/max": 12.893571853637695, | |
| "sampling/sampling_logp_difference/mean": 1.2033199071884155, | |
| "step": 179, | |
| "step_time": 9.336971251999785 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.755895584821701, | |
| "epoch": 0.0018, | |
| "grad_norm": 0.013228918425738811, | |
| "kl": 0.38902273029088974, | |
| "learning_rate": 9.999997601266627e-06, | |
| "loss": -0.0029, | |
| "step": 180, | |
| "step_time": 4.989180980000128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 803.0, | |
| "completions/max_terminated_length": 803.0, | |
| "completions/mean_length": 591.15625, | |
| "completions/mean_terminated_length": 591.15625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.095756113529205, | |
| "epoch": 0.00181, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007322733756154776, | |
| "kl": 0.3757977671921253, | |
| "learning_rate": 9.999997567835209e-06, | |
| "loss": -0.0162, | |
| "num_tokens": 4774032.0, | |
| "reward": 2.0675711631774902, | |
| "reward_std": 0.6423474550247192, | |
| "rewards/rollout_reward_func/mean": 2.0675711631774902, | |
| "rewards/rollout_reward_func/std": 1.5976166725158691, | |
| "sampling/importance_sampling_ratio/max": 0.30949416756629944, | |
| "sampling/importance_sampling_ratio/mean": 0.1696314811706543, | |
| "sampling/importance_sampling_ratio/min": 0.0006361076375469565, | |
| "sampling/sampling_logp_difference/max": 3.4188907146453857, | |
| "sampling/sampling_logp_difference/mean": 0.8557475805282593, | |
| "step": 181, | |
| "step_time": 9.625972572999672 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.120969295501709, | |
| "epoch": 0.00182, | |
| "grad_norm": 0.006717793643474579, | |
| "kl": 0.3749655243009329, | |
| "learning_rate": 9.999997534172434e-06, | |
| "loss": -0.0162, | |
| "step": 182, | |
| "step_time": 4.893760320000183 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 662.0, | |
| "completions/max_terminated_length": 662.0, | |
| "completions/mean_length": 281.8125, | |
| "completions/mean_terminated_length": 281.8125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.411740601062775, | |
| "epoch": 0.00183, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01275318581610918, | |
| "kl": 0.4849829040467739, | |
| "learning_rate": 9.999997500278298e-06, | |
| "loss": -0.012, | |
| "num_tokens": 4821524.0, | |
| "reward": 3.822073459625244, | |
| "reward_std": 1.2214256525039673, | |
| "rewards/rollout_reward_func/mean": 3.822073459625244, | |
| "rewards/rollout_reward_func/std": 1.457135558128357, | |
| "sampling/importance_sampling_ratio/max": 0.5602318644523621, | |
| "sampling/importance_sampling_ratio/mean": 0.2258593738079071, | |
| "sampling/importance_sampling_ratio/min": 8.38409050629707e-06, | |
| "sampling/sampling_logp_difference/max": 3.740847587585449, | |
| "sampling/sampling_logp_difference/mean": 0.9152244925498962, | |
| "step": 183, | |
| "step_time": 7.8192755909999505 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.010416666977107525, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "entropy": 5.4009730219841, | |
| "epoch": 0.00184, | |
| "grad_norm": 0.012373281642794609, | |
| "kl": 0.49285488575696945, | |
| "learning_rate": 9.999997466152803e-06, | |
| "loss": -0.012, | |
| "step": 184, | |
| "step_time": 4.937281341000016 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 794.0, | |
| "completions/max_terminated_length": 794.0, | |
| "completions/mean_length": 324.21875, | |
| "completions/mean_terminated_length": 324.21875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.895534723997116, | |
| "epoch": 0.00185, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.12174241244792938, | |
| "kl": 1.1918272729963064, | |
| "learning_rate": 9.999997431795949e-06, | |
| "loss": -0.0079, | |
| "num_tokens": 4872521.0, | |
| "reward": 2.4634082317352295, | |
| "reward_std": 0.7030278444290161, | |
| "rewards/rollout_reward_func/mean": 2.4634082317352295, | |
| "rewards/rollout_reward_func/std": 1.6452993154525757, | |
| "sampling/importance_sampling_ratio/max": 0.5586066246032715, | |
| "sampling/importance_sampling_ratio/mean": 0.22821499407291412, | |
| "sampling/importance_sampling_ratio/min": 2.045808008557789e-24, | |
| "sampling/sampling_logp_difference/max": 11.78732967376709, | |
| "sampling/sampling_logp_difference/mean": 1.2645323276519775, | |
| "step": 185, | |
| "step_time": 8.363861161000386 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021875000093132257, | |
| "clip_ratio/high_mean": 0.010937500046566129, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010937500046566129, | |
| "entropy": 5.90417143702507, | |
| "epoch": 0.00186, | |
| "grad_norm": 0.06167863681912422, | |
| "kl": 0.7666090168058872, | |
| "learning_rate": 9.999997397207736e-06, | |
| "loss": -0.0089, | |
| "step": 186, | |
| "step_time": 4.719343276000018 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 767.0, | |
| "completions/max_terminated_length": 767.0, | |
| "completions/mean_length": 563.84375, | |
| "completions/mean_terminated_length": 581.51611328125, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 312.0, | |
| "entropy": 4.6642705500125885, | |
| "epoch": 0.00187, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011750214733183384, | |
| "kl": 0.43587249517440796, | |
| "learning_rate": 9.999997362388163e-06, | |
| "loss": -0.0142, | |
| "num_tokens": 4933144.0, | |
| "reward": 2.8686366081237793, | |
| "reward_std": 1.2804789543151855, | |
| "rewards/rollout_reward_func/mean": 2.8686366081237793, | |
| "rewards/rollout_reward_func/std": 1.7704367637634277, | |
| "sampling/importance_sampling_ratio/max": 0.3205587863922119, | |
| "sampling/importance_sampling_ratio/mean": 0.20105448365211487, | |
| "sampling/importance_sampling_ratio/min": 1.161242035863097e-08, | |
| "sampling/sampling_logp_difference/max": 3.013947010040283, | |
| "sampling/sampling_logp_difference/mean": 0.8054625988006592, | |
| "step": 187, | |
| "step_time": 9.18670280699962 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.6675141751766205, | |
| "epoch": 0.00188, | |
| "grad_norm": 0.009789633564651012, | |
| "kl": 0.4357186071574688, | |
| "learning_rate": 9.999997327337232e-06, | |
| "loss": -0.0142, | |
| "step": 188, | |
| "step_time": 4.775712918999034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 796.0, | |
| "completions/max_terminated_length": 796.0, | |
| "completions/mean_length": 487.1875, | |
| "completions/mean_terminated_length": 487.1875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.183352828025818, | |
| "epoch": 0.00189, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007217842619866133, | |
| "kl": 0.36203115805983543, | |
| "learning_rate": 9.99999729205494e-06, | |
| "loss": -0.0087, | |
| "num_tokens": 4991196.0, | |
| "reward": 2.807063102722168, | |
| "reward_std": 1.7962517738342285, | |
| "rewards/rollout_reward_func/mean": 2.807063102722168, | |
| "rewards/rollout_reward_func/std": 2.033599853515625, | |
| "sampling/importance_sampling_ratio/max": 0.3048250079154968, | |
| "sampling/importance_sampling_ratio/mean": 0.13251593708992004, | |
| "sampling/importance_sampling_ratio/min": 0.00014422468666452914, | |
| "sampling/sampling_logp_difference/max": 2.6139421463012695, | |
| "sampling/sampling_logp_difference/mean": 1.1694201231002808, | |
| "step": 189, | |
| "step_time": 8.603061843000887 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.190056622028351, | |
| "epoch": 0.0019, | |
| "grad_norm": 0.0067920200526714325, | |
| "kl": 0.3644371014088392, | |
| "learning_rate": 9.99999725654129e-06, | |
| "loss": -0.0087, | |
| "step": 190, | |
| "step_time": 5.3103096270001515 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 857.0, | |
| "completions/max_terminated_length": 857.0, | |
| "completions/mean_length": 593.125, | |
| "completions/mean_terminated_length": 585.0967407226562, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.34462833404541, | |
| "epoch": 0.00191, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0056811366230249405, | |
| "kl": 0.34313227608799934, | |
| "learning_rate": 9.999997220796281e-06, | |
| "loss": -0.0185, | |
| "num_tokens": 5052550.0, | |
| "reward": 3.0586280822753906, | |
| "reward_std": 1.8891185522079468, | |
| "rewards/rollout_reward_func/mean": 3.0586280822753906, | |
| "rewards/rollout_reward_func/std": 1.9580395221710205, | |
| "sampling/importance_sampling_ratio/max": 0.3123818635940552, | |
| "sampling/importance_sampling_ratio/mean": 0.17983976006507874, | |
| "sampling/importance_sampling_ratio/min": 1.8176118621212876e-21, | |
| "sampling/sampling_logp_difference/max": 3.9357526302337646, | |
| "sampling/sampling_logp_difference/mean": 1.0673115253448486, | |
| "step": 191, | |
| "step_time": 9.136948834000577 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.350877106189728, | |
| "epoch": 0.00192, | |
| "grad_norm": 0.006298670079559088, | |
| "kl": 0.3456762544810772, | |
| "learning_rate": 9.999997184819913e-06, | |
| "loss": -0.0185, | |
| "step": 192, | |
| "step_time": 5.421018731999538 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 805.0, | |
| "completions/max_terminated_length": 805.0, | |
| "completions/mean_length": 463.0625, | |
| "completions/mean_terminated_length": 463.0625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.9372279047966, | |
| "epoch": 0.00193, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006436275318264961, | |
| "kl": 0.4286567382514477, | |
| "learning_rate": 9.999997148612186e-06, | |
| "loss": -0.0093, | |
| "num_tokens": 5107904.0, | |
| "reward": 2.5031309127807617, | |
| "reward_std": 0.7777051329612732, | |
| "rewards/rollout_reward_func/mean": 2.5031309127807617, | |
| "rewards/rollout_reward_func/std": 1.5234891176223755, | |
| "sampling/importance_sampling_ratio/max": 0.556627631187439, | |
| "sampling/importance_sampling_ratio/mean": 0.24726936221122742, | |
| "sampling/importance_sampling_ratio/min": 1.4663258658697762e-21, | |
| "sampling/sampling_logp_difference/max": 11.847373008728027, | |
| "sampling/sampling_logp_difference/mean": 1.019608974456787, | |
| "step": 193, | |
| "step_time": 8.926773718000277 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.932731568813324, | |
| "epoch": 0.00194, | |
| "grad_norm": 0.006187028717249632, | |
| "kl": 0.42864546552300453, | |
| "learning_rate": 9.9999971121731e-06, | |
| "loss": -0.0093, | |
| "step": 194, | |
| "step_time": 4.810186862000137 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 796.0, | |
| "completions/max_terminated_length": 796.0, | |
| "completions/mean_length": 289.0, | |
| "completions/mean_terminated_length": 289.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.024335443973541, | |
| "epoch": 0.00195, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.014793330803513527, | |
| "kl": 0.5311180464923382, | |
| "learning_rate": 9.999997075502653e-06, | |
| "loss": -0.0, | |
| "num_tokens": 5153943.0, | |
| "reward": 2.680002212524414, | |
| "reward_std": 1.0660200119018555, | |
| "rewards/rollout_reward_func/mean": 2.680002212524414, | |
| "rewards/rollout_reward_func/std": 1.608717441558838, | |
| "sampling/importance_sampling_ratio/max": 0.556300938129425, | |
| "sampling/importance_sampling_ratio/mean": 0.1869335174560547, | |
| "sampling/importance_sampling_ratio/min": 8.39509803904695e-14, | |
| "sampling/sampling_logp_difference/max": 3.1887903213500977, | |
| "sampling/sampling_logp_difference/mean": 1.1006312370300293, | |
| "step": 195, | |
| "step_time": 8.258161678000306 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.993339031934738, | |
| "epoch": 0.00196, | |
| "grad_norm": 0.01399671845138073, | |
| "kl": 0.5270919986069202, | |
| "learning_rate": 9.999997038600848e-06, | |
| "loss": -0.0, | |
| "step": 196, | |
| "step_time": 5.157405102000212 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 810.0, | |
| "completions/max_terminated_length": 776.0, | |
| "completions/mean_length": 596.125, | |
| "completions/mean_terminated_length": 586.7333374023438, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.484014600515366, | |
| "epoch": 0.00197, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011650401167571545, | |
| "kl": 0.25742718297988176, | |
| "learning_rate": 9.999997001467682e-06, | |
| "loss": -0.0218, | |
| "num_tokens": 5214471.0, | |
| "reward": 2.370987892150879, | |
| "reward_std": 1.2418047189712524, | |
| "rewards/rollout_reward_func/mean": 2.370987892150879, | |
| "rewards/rollout_reward_func/std": 1.7970783710479736, | |
| "sampling/importance_sampling_ratio/max": 0.30147287249565125, | |
| "sampling/importance_sampling_ratio/mean": 0.12158507108688354, | |
| "sampling/importance_sampling_ratio/min": 1.8635179788931512e-17, | |
| "sampling/sampling_logp_difference/max": 4.204184055328369, | |
| "sampling/sampling_logp_difference/mean": 1.3044867515563965, | |
| "step": 197, | |
| "step_time": 9.104788559999633 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.468224465847015, | |
| "epoch": 0.00198, | |
| "grad_norm": 0.011098474264144897, | |
| "kl": 0.2528890473768115, | |
| "learning_rate": 9.99999696410316e-06, | |
| "loss": -0.0218, | |
| "step": 198, | |
| "step_time": 5.2834800850005195 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 742.0, | |
| "completions/max_terminated_length": 742.0, | |
| "completions/mean_length": 425.25, | |
| "completions/mean_terminated_length": 425.25, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.416600346565247, | |
| "epoch": 0.00199, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.6569156646728516, | |
| "kl": 1.3255293928086758, | |
| "learning_rate": 9.999996926507279e-06, | |
| "loss": -0.0115, | |
| "num_tokens": 5267520.0, | |
| "reward": 3.124060869216919, | |
| "reward_std": 1.0585153102874756, | |
| "rewards/rollout_reward_func/mean": 3.124060869216919, | |
| "rewards/rollout_reward_func/std": 1.6291773319244385, | |
| "sampling/importance_sampling_ratio/max": 0.5548077821731567, | |
| "sampling/importance_sampling_ratio/mean": 0.2113969922065735, | |
| "sampling/importance_sampling_ratio/min": 0.0004637441597878933, | |
| "sampling/sampling_logp_difference/max": 2.2059574127197266, | |
| "sampling/sampling_logp_difference/mean": 0.9068723320960999, | |
| "step": 199, | |
| "step_time": 8.44307382299985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 5.432026922702789, | |
| "epoch": 0.002, | |
| "grad_norm": 0.008453106507658958, | |
| "kl": 0.38647904247045517, | |
| "learning_rate": 9.999996888680038e-06, | |
| "loss": -0.014, | |
| "step": 200, | |
| "step_time": 4.681182854999861 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 712.0, | |
| "completions/max_terminated_length": 712.0, | |
| "completions/mean_length": 360.5625, | |
| "completions/mean_terminated_length": 371.6773986816406, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.117161571979523, | |
| "epoch": 0.00201, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01611809805035591, | |
| "kl": 0.4253620970994234, | |
| "learning_rate": 9.999996850621436e-06, | |
| "loss": -0.0152, | |
| "num_tokens": 5318093.0, | |
| "reward": 2.912040948867798, | |
| "reward_std": 0.7651806473731995, | |
| "rewards/rollout_reward_func/mean": 2.912040948867798, | |
| "rewards/rollout_reward_func/std": 1.8777114152908325, | |
| "sampling/importance_sampling_ratio/max": 0.556606113910675, | |
| "sampling/importance_sampling_ratio/mean": 0.21097299456596375, | |
| "sampling/importance_sampling_ratio/min": 2.815273847378563e-10, | |
| "sampling/sampling_logp_difference/max": 4.575296401977539, | |
| "sampling/sampling_logp_difference/mean": 0.9073153734207153, | |
| "step": 201, | |
| "step_time": 8.590784671000165 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015625, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 5.127772510051727, | |
| "epoch": 0.00202, | |
| "grad_norm": 0.01754254475235939, | |
| "kl": 0.42357040755450726, | |
| "learning_rate": 9.999996812331476e-06, | |
| "loss": -0.0152, | |
| "step": 202, | |
| "step_time": 4.51354878300026 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022727273404598236, | |
| "clip_ratio/high_mean": 0.011363636702299118, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.011363636702299118, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 823.0, | |
| "completions/max_terminated_length": 823.0, | |
| "completions/mean_length": 246.34375, | |
| "completions/mean_terminated_length": 251.36668395996094, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.282517075538635, | |
| "epoch": 0.00203, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.014741458930075169, | |
| "kl": 0.41361628845334053, | |
| "learning_rate": 9.999996773810157e-06, | |
| "loss": -0.009, | |
| "num_tokens": 5363579.0, | |
| "reward": 2.7251787185668945, | |
| "reward_std": 0.8450720310211182, | |
| "rewards/rollout_reward_func/mean": 2.7251787185668945, | |
| "rewards/rollout_reward_func/std": 1.7295880317687988, | |
| "sampling/importance_sampling_ratio/max": 0.5568343997001648, | |
| "sampling/importance_sampling_ratio/mean": 0.1812913417816162, | |
| "sampling/importance_sampling_ratio/min": 9.55414493809214e-19, | |
| "sampling/sampling_logp_difference/max": 12.835467338562012, | |
| "sampling/sampling_logp_difference/mean": 1.3004571199417114, | |
| "step": 203, | |
| "step_time": 8.776359550999132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.294387757778168, | |
| "epoch": 0.00204, | |
| "grad_norm": 0.018707161769270897, | |
| "kl": 0.41007015481591225, | |
| "learning_rate": 9.99999673505748e-06, | |
| "loss": -0.0089, | |
| "step": 204, | |
| "step_time": 5.04047468599947 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 712.0, | |
| "completions/max_terminated_length": 712.0, | |
| "completions/mean_length": 226.71875, | |
| "completions/mean_terminated_length": 233.51612854003906, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.579507499933243, | |
| "epoch": 0.00205, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.015186581760644913, | |
| "kl": 0.4859627615660429, | |
| "learning_rate": 9.999996696073441e-06, | |
| "loss": -0.0153, | |
| "num_tokens": 5407892.0, | |
| "reward": 3.182164192199707, | |
| "reward_std": 1.4856499433517456, | |
| "rewards/rollout_reward_func/mean": 3.182164192199707, | |
| "rewards/rollout_reward_func/std": 1.7880464792251587, | |
| "sampling/importance_sampling_ratio/max": 0.5547261238098145, | |
| "sampling/importance_sampling_ratio/mean": 0.23533114790916443, | |
| "sampling/importance_sampling_ratio/min": 1.2176733044458061e-14, | |
| "sampling/sampling_logp_difference/max": 5.080104351043701, | |
| "sampling/sampling_logp_difference/mean": 1.195305347442627, | |
| "step": 205, | |
| "step_time": 8.019790785999703 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0024999999441206455, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0024999999441206455, | |
| "entropy": 5.567036747932434, | |
| "epoch": 0.00206, | |
| "grad_norm": 0.015953045338392258, | |
| "kl": 0.4858721327036619, | |
| "learning_rate": 9.999996656858045e-06, | |
| "loss": -0.0153, | |
| "step": 206, | |
| "step_time": 4.494372066999858 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 694.0, | |
| "completions/max_terminated_length": 694.0, | |
| "completions/mean_length": 257.25, | |
| "completions/mean_terminated_length": 265.0322570800781, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.651438236236572, | |
| "epoch": 0.00207, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.02735012210905552, | |
| "kl": 0.4778033494949341, | |
| "learning_rate": 9.99999661741129e-06, | |
| "loss": -0.0107, | |
| "num_tokens": 5454600.0, | |
| "reward": 3.8261237144470215, | |
| "reward_std": 1.0173990726470947, | |
| "rewards/rollout_reward_func/mean": 3.8261237144470215, | |
| "rewards/rollout_reward_func/std": 1.3678010702133179, | |
| "sampling/importance_sampling_ratio/max": 0.5530157089233398, | |
| "sampling/importance_sampling_ratio/mean": 0.22838255763053894, | |
| "sampling/importance_sampling_ratio/min": 6.239156959964021e-07, | |
| "sampling/sampling_logp_difference/max": 3.2890069484710693, | |
| "sampling/sampling_logp_difference/mean": 0.932339072227478, | |
| "step": 207, | |
| "step_time": 8.438589544000479 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.5857046246528625, | |
| "epoch": 0.00208, | |
| "grad_norm": 0.026330100372433662, | |
| "kl": 0.4851069003343582, | |
| "learning_rate": 9.999996577733175e-06, | |
| "loss": -0.0107, | |
| "step": 208, | |
| "step_time": 4.460181868000291 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 830.0, | |
| "completions/max_terminated_length": 830.0, | |
| "completions/mean_length": 397.125, | |
| "completions/mean_terminated_length": 397.125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.61989688873291, | |
| "epoch": 0.00209, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.03519469127058983, | |
| "kl": 0.4516414776444435, | |
| "learning_rate": 9.9999965378237e-06, | |
| "loss": -0.0089, | |
| "num_tokens": 5507788.0, | |
| "reward": 2.7961456775665283, | |
| "reward_std": 1.177910566329956, | |
| "rewards/rollout_reward_func/mean": 2.7961456775665283, | |
| "rewards/rollout_reward_func/std": 1.9247767925262451, | |
| "sampling/importance_sampling_ratio/max": 0.5473094582557678, | |
| "sampling/importance_sampling_ratio/mean": 0.18874725699424744, | |
| "sampling/importance_sampling_ratio/min": 0.00020230493100825697, | |
| "sampling/sampling_logp_difference/max": 2.5100324153900146, | |
| "sampling/sampling_logp_difference/mean": 0.9597841501235962, | |
| "step": 209, | |
| "step_time": 9.165979287000027 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.536510765552521, | |
| "epoch": 0.0021, | |
| "grad_norm": 0.02714668959379196, | |
| "kl": 0.45711198449134827, | |
| "learning_rate": 9.999996497682868e-06, | |
| "loss": -0.009, | |
| "step": 210, | |
| "step_time": 4.771343512000385 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 832.0, | |
| "completions/max_terminated_length": 832.0, | |
| "completions/mean_length": 695.15625, | |
| "completions/mean_terminated_length": 695.15625, | |
| "completions/min_length": 585.0, | |
| "completions/min_terminated_length": 585.0, | |
| "entropy": 5.806370496749878, | |
| "epoch": 0.00211, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02025657892227173, | |
| "kl": 0.2787773534655571, | |
| "learning_rate": 9.999996457310676e-06, | |
| "loss": -0.0145, | |
| "num_tokens": 5572137.0, | |
| "reward": 2.248298168182373, | |
| "reward_std": 0.9219829440116882, | |
| "rewards/rollout_reward_func/mean": 2.248298168182373, | |
| "rewards/rollout_reward_func/std": 1.4261958599090576, | |
| "sampling/importance_sampling_ratio/max": 0.301658034324646, | |
| "sampling/importance_sampling_ratio/mean": 0.13740620017051697, | |
| "sampling/importance_sampling_ratio/min": 9.007880552580616e-10, | |
| "sampling/sampling_logp_difference/max": 3.91507887840271, | |
| "sampling/sampling_logp_difference/mean": 0.9571284055709839, | |
| "step": 211, | |
| "step_time": 9.053193517001091 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.7584188580513, | |
| "epoch": 0.00212, | |
| "grad_norm": 0.017554691061377525, | |
| "kl": 0.28337166644632816, | |
| "learning_rate": 9.999996416707125e-06, | |
| "loss": -0.0145, | |
| "step": 212, | |
| "step_time": 4.910072835999927 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 794.0, | |
| "completions/max_terminated_length": 794.0, | |
| "completions/mean_length": 306.875, | |
| "completions/mean_terminated_length": 306.875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.477221041917801, | |
| "epoch": 0.00213, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.012003457173705101, | |
| "kl": 0.6022003293037415, | |
| "learning_rate": 9.999996375872214e-06, | |
| "loss": -0.0063, | |
| "num_tokens": 5618850.0, | |
| "reward": 2.7603349685668945, | |
| "reward_std": 0.35092732310295105, | |
| "rewards/rollout_reward_func/mean": 2.7603349685668945, | |
| "rewards/rollout_reward_func/std": 1.3781588077545166, | |
| "sampling/importance_sampling_ratio/max": 0.5524585843086243, | |
| "sampling/importance_sampling_ratio/mean": 0.3040264844894409, | |
| "sampling/importance_sampling_ratio/min": 0.0008875139756128192, | |
| "sampling/sampling_logp_difference/max": 2.6261608600616455, | |
| "sampling/sampling_logp_difference/mean": 0.7000038623809814, | |
| "step": 213, | |
| "step_time": 8.743301202000112 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.459619641304016, | |
| "epoch": 0.00214, | |
| "grad_norm": 0.010844763368368149, | |
| "kl": 0.6048287376761436, | |
| "learning_rate": 9.999996334805946e-06, | |
| "loss": -0.0063, | |
| "step": 214, | |
| "step_time": 4.693870484000399 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 839.0, | |
| "completions/max_terminated_length": 839.0, | |
| "completions/mean_length": 359.09375, | |
| "completions/mean_terminated_length": 370.1612854003906, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.843548268079758, | |
| "epoch": 0.00215, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.047307442873716354, | |
| "kl": 1.0721620507538319, | |
| "learning_rate": 9.999996293508317e-06, | |
| "loss": -0.012, | |
| "num_tokens": 5670482.0, | |
| "reward": 2.6061129570007324, | |
| "reward_std": 0.6701341867446899, | |
| "rewards/rollout_reward_func/mean": 2.6061129570007324, | |
| "rewards/rollout_reward_func/std": 1.2390384674072266, | |
| "sampling/importance_sampling_ratio/max": 0.5575221180915833, | |
| "sampling/importance_sampling_ratio/mean": 0.2627297639846802, | |
| "sampling/importance_sampling_ratio/min": 5.8882815068272976e-08, | |
| "sampling/sampling_logp_difference/max": 2.7387423515319824, | |
| "sampling/sampling_logp_difference/mean": 0.7991349101066589, | |
| "step": 215, | |
| "step_time": 9.773670517000028 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.866431772708893, | |
| "epoch": 0.00216, | |
| "grad_norm": 0.02840716764330864, | |
| "kl": 0.9429246261715889, | |
| "learning_rate": 9.999996251979329e-06, | |
| "loss": -0.0121, | |
| "step": 216, | |
| "step_time": 5.003656359999695 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 715.0, | |
| "completions/max_terminated_length": 715.0, | |
| "completions/mean_length": 386.65625, | |
| "completions/mean_terminated_length": 386.65625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.9683555364608765, | |
| "epoch": 0.00217, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.056511927396059036, | |
| "kl": 0.435198824852705, | |
| "learning_rate": 9.999996210218981e-06, | |
| "loss": -0.0082, | |
| "num_tokens": 5723431.0, | |
| "reward": 2.3611326217651367, | |
| "reward_std": 1.23284113407135, | |
| "rewards/rollout_reward_func/mean": 2.3611326217651367, | |
| "rewards/rollout_reward_func/std": 2.1599881649017334, | |
| "sampling/importance_sampling_ratio/max": 0.5584315657615662, | |
| "sampling/importance_sampling_ratio/mean": 0.23213379085063934, | |
| "sampling/importance_sampling_ratio/min": 9.592416063242126e-06, | |
| "sampling/sampling_logp_difference/max": 3.0070853233337402, | |
| "sampling/sampling_logp_difference/mean": 0.7948013544082642, | |
| "step": 217, | |
| "step_time": 8.342223236000336 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.010416666977107525, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010416666977107525, | |
| "entropy": 4.992517560720444, | |
| "epoch": 0.00218, | |
| "grad_norm": 0.011323577724397182, | |
| "kl": 0.43109437450766563, | |
| "learning_rate": 9.999996168227277e-06, | |
| "loss": -0.0084, | |
| "step": 218, | |
| "step_time": 4.607645754999339 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 832.0, | |
| "completions/max_terminated_length": 832.0, | |
| "completions/mean_length": 301.9375, | |
| "completions/mean_terminated_length": 301.9375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 4.874771684408188, | |
| "epoch": 0.00219, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.031172219663858414, | |
| "kl": 0.6132681779563427, | |
| "learning_rate": 9.999996126004213e-06, | |
| "loss": -0.0065, | |
| "num_tokens": 5770701.0, | |
| "reward": 2.9851279258728027, | |
| "reward_std": 0.8050676584243774, | |
| "rewards/rollout_reward_func/mean": 2.9851279258728027, | |
| "rewards/rollout_reward_func/std": 1.344331979751587, | |
| "sampling/importance_sampling_ratio/max": 0.5544815063476562, | |
| "sampling/importance_sampling_ratio/mean": 0.24593959748744965, | |
| "sampling/importance_sampling_ratio/min": 4.177666784559164e-14, | |
| "sampling/sampling_logp_difference/max": 4.385872840881348, | |
| "sampling/sampling_logp_difference/mean": 0.8459481000900269, | |
| "step": 219, | |
| "step_time": 8.750391007999951 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 4.893818587064743, | |
| "epoch": 0.0022, | |
| "grad_norm": 0.03290743753314018, | |
| "kl": 0.6108976900577545, | |
| "learning_rate": 9.999996083549788e-06, | |
| "loss": -0.0065, | |
| "step": 220, | |
| "step_time": 4.7656036020011925 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00657894741743803, | |
| "clip_ratio/high_mean": 0.003289473708719015, | |
| "clip_ratio/low_mean": 0.0062500000931322575, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.009539473801851273, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 778.0, | |
| "completions/max_terminated_length": 778.0, | |
| "completions/mean_length": 569.1875, | |
| "completions/mean_terminated_length": 569.1875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.575940668582916, | |
| "epoch": 0.00221, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011595762334764004, | |
| "kl": 0.347899217158556, | |
| "learning_rate": 9.999996040864003e-06, | |
| "loss": -0.0171, | |
| "num_tokens": 5830699.0, | |
| "reward": 3.000262498855591, | |
| "reward_std": 1.67076575756073, | |
| "rewards/rollout_reward_func/mean": 3.000262498855591, | |
| "rewards/rollout_reward_func/std": 1.8007228374481201, | |
| "sampling/importance_sampling_ratio/max": 0.3085794150829315, | |
| "sampling/importance_sampling_ratio/mean": 0.15168991684913635, | |
| "sampling/importance_sampling_ratio/min": 4.0518877315876e-12, | |
| "sampling/sampling_logp_difference/max": 2.8902509212493896, | |
| "sampling/sampling_logp_difference/mean": 0.9849369525909424, | |
| "step": 221, | |
| "step_time": 9.164814059999571 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0062500000931322575, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0062500000931322575, | |
| "entropy": 5.600629568099976, | |
| "epoch": 0.00222, | |
| "grad_norm": 0.010892020538449287, | |
| "kl": 0.3483094722032547, | |
| "learning_rate": 9.999995997946861e-06, | |
| "loss": -0.0171, | |
| "step": 222, | |
| "step_time": 4.744326772000022 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0028409091755747795, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 671.0, | |
| "completions/max_terminated_length": 671.0, | |
| "completions/mean_length": 410.9375, | |
| "completions/mean_terminated_length": 410.9375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.076228618621826, | |
| "epoch": 0.00223, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.007272793911397457, | |
| "kl": 0.37392777390778065, | |
| "learning_rate": 9.999995954798361e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 5882689.0, | |
| "reward": 2.952998161315918, | |
| "reward_std": 1.2456332445144653, | |
| "rewards/rollout_reward_func/mean": 2.952998161315918, | |
| "rewards/rollout_reward_func/std": 1.8146508932113647, | |
| "sampling/importance_sampling_ratio/max": 0.5566303133964539, | |
| "sampling/importance_sampling_ratio/mean": 0.1916414350271225, | |
| "sampling/importance_sampling_ratio/min": 5.414286691375744e-15, | |
| "sampling/sampling_logp_difference/max": 3.779116630554199, | |
| "sampling/sampling_logp_difference/mean": 1.1832430362701416, | |
| "step": 223, | |
| "step_time": 8.38111467600038 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.1000664830207825, | |
| "epoch": 0.00224, | |
| "grad_norm": 0.00639992905780673, | |
| "kl": 0.37375164218246937, | |
| "learning_rate": 9.9999959114185e-06, | |
| "loss": -0.0002, | |
| "step": 224, | |
| "step_time": 5.022103202999915 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 751.0, | |
| "completions/max_terminated_length": 751.0, | |
| "completions/mean_length": 251.3125, | |
| "completions/mean_terminated_length": 258.9032287597656, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.927689909934998, | |
| "epoch": 0.00225, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027866557240486145, | |
| "kl": 0.44809701666235924, | |
| "learning_rate": 9.999995867807281e-06, | |
| "loss": -0.0124, | |
| "num_tokens": 5928498.0, | |
| "reward": 2.4831409454345703, | |
| "reward_std": 0.8753278255462646, | |
| "rewards/rollout_reward_func/mean": 2.4831409454345703, | |
| "rewards/rollout_reward_func/std": 1.5412579774856567, | |
| "sampling/importance_sampling_ratio/max": 0.5537245869636536, | |
| "sampling/importance_sampling_ratio/mean": 0.21668727695941925, | |
| "sampling/importance_sampling_ratio/min": 7.331964479995179e-10, | |
| "sampling/sampling_logp_difference/max": 3.372708797454834, | |
| "sampling/sampling_logp_difference/mean": 1.118770956993103, | |
| "step": 225, | |
| "step_time": 8.150611867999942 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 400000, | |
| "num_input_tokens_seen": 5928498, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |