Instructions to use mjf-su/GRPO-Model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use mjf-su/GRPO-Model with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="mjf-su/GRPO-Model")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```

```python
# Load model directly
from transformers import AutoProcessor, AutoModelForMultimodalLM

processor = AutoProcessor.from_pretrained("mjf-su/GRPO-Model")
model = AutoModelForMultimodalLM.from_pretrained("mjf-su/GRPO-Model")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use mjf-su/GRPO-Model with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "mjf-su/GRPO-Model"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "mjf-su/GRPO-Model",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
Use Docker
docker model run hf.co/mjf-su/GRPO-Model
- SGLang
How to use mjf-su/GRPO-Model with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "mjf-su/GRPO-Model" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "mjf-su/GRPO-Model",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "mjf-su/GRPO-Model" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "mjf-su/GRPO-Model",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
- Docker Model Runner
How to use mjf-su/GRPO-Model with Docker Model Runner:
docker model run hf.co/mjf-su/GRPO-Model
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.1251500085719184, | |
| "eval_steps": 500, | |
| "global_step": 730, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.009895833674818277, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 921.2, | |
| "completions/mean_length": 747.1151245117187, | |
| "completions/mean_terminated_length": 744.3689392089843, | |
| "completions/min_length": 457.8, | |
| "completions/min_terminated_length": 457.8, | |
| "entropy": 0.7938701828320821, | |
| "epoch": 0.0017143836790673753, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "num_tokens": 4482493.0, | |
| "reward": -11.88613681793213, | |
| "reward_std": 9.975114345550537, | |
| "rewards/ADERawReward/mean": -11.914521789550781, | |
| "rewards/ADERawReward/std": 9.975466585159301, | |
| "rewards/StrictFormatReward/mean": 0.28385417386889455, | |
| "rewards/StrictFormatReward/std": 2.689612889289856, | |
| "sampling/importance_sampling_ratio/max": 2.7710394859313965, | |
| "sampling/importance_sampling_ratio/mean": 0.38329996466636657, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.348051142692566, | |
| "sampling/sampling_logp_difference/mean": 0.02634742669761181, | |
| "step": 10, | |
| "step_time": 31.851699317700696 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.014062500465661288, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 901.0, | |
| "completions/mean_length": 746.5093994140625, | |
| "completions/mean_terminated_length": 742.5637084960938, | |
| "completions/min_length": 319.8, | |
| "completions/min_terminated_length": 319.8, | |
| "entropy": 0.783385141690572, | |
| "epoch": 0.0034287673581347507, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0167, | |
| "num_tokens": 8963695.0, | |
| "reward": -10.739660549163819, | |
| "reward_std": 7.596912527084351, | |
| "rewards/ADERawReward/mean": -10.76804552078247, | |
| "rewards/ADERawReward/std": 7.586498594284057, | |
| "rewards/StrictFormatReward/mean": 0.2838541708886623, | |
| "rewards/StrictFormatReward/std": 2.69962215423584, | |
| "sampling/importance_sampling_ratio/max": 2.778630328178406, | |
| "sampling/importance_sampling_ratio/mean": 0.38422776758670807, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.3024064064025875, | |
| "sampling/sampling_logp_difference/mean": 0.026079285889863968, | |
| "step": 20, | |
| "step_time": 31.08811371029442 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.01302083358168602, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 916.5, | |
| "completions/mean_length": 745.9713745117188, | |
| "completions/mean_terminated_length": 742.3107482910157, | |
| "completions/min_length": 344.2, | |
| "completions/min_terminated_length": 344.2, | |
| "entropy": 0.777154250939687, | |
| "epoch": 0.005143151037202126, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0033, | |
| "num_tokens": 13443080.0, | |
| "reward": -10.726643276214599, | |
| "reward_std": 6.941620349884033, | |
| "rewards/ADERawReward/mean": -10.759612274169921, | |
| "rewards/ADERawReward/std": 6.939747667312622, | |
| "rewards/StrictFormatReward/mean": 0.32968750670552255, | |
| "rewards/StrictFormatReward/std": 2.6174988746643066, | |
| "sampling/importance_sampling_ratio/max": 2.820967411994934, | |
| "sampling/importance_sampling_ratio/mean": 0.3970964789390564, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.230282878875732, | |
| "sampling/sampling_logp_difference/mean": 0.025640238262712955, | |
| "step": 30, | |
| "step_time": 31.281337887697738 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006250000186264515, | |
| "completions/max_length": 1006.1, | |
| "completions/max_terminated_length": 952.6, | |
| "completions/mean_length": 746.8578247070312, | |
| "completions/mean_terminated_length": 745.1074523925781, | |
| "completions/min_length": 574.6, | |
| "completions/min_terminated_length": 574.6, | |
| "entropy": 0.7786477545897166, | |
| "epoch": 0.006857534716269501, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0052, | |
| "num_tokens": 17925079.0, | |
| "reward": -10.400494289398193, | |
| "reward_std": 8.363166570663452, | |
| "rewards/ADERawReward/mean": -10.453514766693115, | |
| "rewards/ADERawReward/std": 8.35603289604187, | |
| "rewards/StrictFormatReward/mean": 0.5302083313465118, | |
| "rewards/StrictFormatReward/std": 2.1997122406959533, | |
| "sampling/importance_sampling_ratio/max": 2.7520124673843385, | |
| "sampling/importance_sampling_ratio/mean": 0.4182109236717224, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.882514882087707, | |
| "sampling/sampling_logp_difference/mean": 0.025688859820365905, | |
| "step": 40, | |
| "step_time": 30.94830545460136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010416666883975268, | |
| "completions/max_length": 1008.7, | |
| "completions/max_terminated_length": 908.0, | |
| "completions/mean_length": 744.6708557128907, | |
| "completions/mean_terminated_length": 741.7320251464844, | |
| "completions/min_length": 318.0, | |
| "completions/min_terminated_length": 318.0, | |
| "entropy": 0.7797636012236278, | |
| "epoch": 0.008571918395336876, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 22402927.0, | |
| "reward": -11.912008953094482, | |
| "reward_std": 25.465690279006957, | |
| "rewards/ADERawReward/mean": -11.948414325714111, | |
| "rewards/ADERawReward/std": 25.452851724624633, | |
| "rewards/StrictFormatReward/mean": 0.3640625074505806, | |
| "rewards/StrictFormatReward/std": 2.5515830755233764, | |
| "sampling/importance_sampling_ratio/max": 2.7972999811172485, | |
| "sampling/importance_sampling_ratio/mean": 0.41742126941680907, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.1138537406921385, | |
| "sampling/sampling_logp_difference/mean": 0.025952593609690666, | |
| "step": 50, | |
| "step_time": 30.99103705089656 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00572916679084301, | |
| "completions/max_length": 982.4, | |
| "completions/max_terminated_length": 894.4, | |
| "completions/mean_length": 743.1708618164063, | |
| "completions/mean_terminated_length": 741.5553588867188, | |
| "completions/min_length": 389.5, | |
| "completions/min_terminated_length": 389.5, | |
| "entropy": 0.7705018659432729, | |
| "epoch": 0.010286302074404252, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.001, | |
| "num_tokens": 26877415.0, | |
| "reward": -10.354321956634521, | |
| "reward_std": 7.085966682434082, | |
| "rewards/ADERawReward/mean": -10.3998948097229, | |
| "rewards/ADERawReward/std": 7.078661155700684, | |
| "rewards/StrictFormatReward/mean": 0.45572916492819787, | |
| "rewards/StrictFormatReward/std": 2.357165718078613, | |
| "sampling/importance_sampling_ratio/max": 2.8156795501708984, | |
| "sampling/importance_sampling_ratio/mean": 0.4263548344373703, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.213912606239319, | |
| "sampling/sampling_logp_difference/mean": 0.025748718157410623, | |
| "step": 60, | |
| "step_time": 30.629806772600567 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007812500139698387, | |
| "completions/max_length": 997.5, | |
| "completions/max_terminated_length": 901.5, | |
| "completions/mean_length": 745.1635559082031, | |
| "completions/mean_terminated_length": 742.9772766113281, | |
| "completions/min_length": 448.0, | |
| "completions/min_terminated_length": 448.0, | |
| "entropy": 0.7808759689331055, | |
| "epoch": 0.012000685753471628, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0022, | |
| "num_tokens": 31355793.0, | |
| "reward": -10.355865859985352, | |
| "reward_std": 7.629749822616577, | |
| "rewards/ADERawReward/mean": -10.399147129058838, | |
| "rewards/ADERawReward/std": 7.623885774612427, | |
| "rewards/StrictFormatReward/mean": 0.43281250447034836, | |
| "rewards/StrictFormatReward/std": 2.4088792324066164, | |
| "sampling/importance_sampling_ratio/max": 2.7858126163482666, | |
| "sampling/importance_sampling_ratio/mean": 0.4104765444993973, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.654694700241089, | |
| "sampling/sampling_logp_difference/mean": 0.02596179526299238, | |
| "step": 70, | |
| "step_time": 30.90157012869895 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006250000139698386, | |
| "completions/max_length": 997.0, | |
| "completions/max_terminated_length": 911.7, | |
| "completions/mean_length": 743.9010620117188, | |
| "completions/mean_terminated_length": 742.1458801269531, | |
| "completions/min_length": 506.5, | |
| "completions/min_terminated_length": 506.5, | |
| "entropy": 0.7690683722496032, | |
| "epoch": 0.013715069432539003, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0147, | |
| "num_tokens": 35831699.0, | |
| "reward": -9.603075504302979, | |
| "reward_std": 7.397923803329467, | |
| "rewards/ADERawReward/mean": -9.64406452178955, | |
| "rewards/ADERawReward/std": 7.394052314758301, | |
| "rewards/StrictFormatReward/mean": 0.40989583656191825, | |
| "rewards/StrictFormatReward/std": 2.446663808822632, | |
| "sampling/importance_sampling_ratio/max": 2.8509432315826415, | |
| "sampling/importance_sampling_ratio/mean": 0.42068196535110475, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.871529459953308, | |
| "sampling/sampling_logp_difference/mean": 0.025504560954868794, | |
| "step": 80, | |
| "step_time": 30.775000178898335 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006770833488553762, | |
| "completions/max_length": 1018.6, | |
| "completions/max_terminated_length": 921.4, | |
| "completions/mean_length": 743.0687744140625, | |
| "completions/mean_terminated_length": 741.1454040527344, | |
| "completions/min_length": 413.2, | |
| "completions/min_terminated_length": 413.2, | |
| "entropy": 0.7730992416540782, | |
| "epoch": 0.015429453111606378, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0209, | |
| "num_tokens": 40306439.0, | |
| "reward": -9.999512577056885, | |
| "reward_std": 6.795107650756836, | |
| "rewards/ADERawReward/mean": -10.04222068786621, | |
| "rewards/ADERawReward/std": 6.785987043380738, | |
| "rewards/StrictFormatReward/mean": 0.4270833358168602, | |
| "rewards/StrictFormatReward/std": 2.427087187767029, | |
| "sampling/importance_sampling_ratio/max": 2.806507611274719, | |
| "sampling/importance_sampling_ratio/mean": 0.41283826231956483, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.893343424797058, | |
| "sampling/sampling_logp_difference/mean": 0.02597447969019413, | |
| "step": 90, | |
| "step_time": 31.03075063330034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006250000139698386, | |
| "completions/max_length": 989.1, | |
| "completions/max_terminated_length": 911.9, | |
| "completions/mean_length": 744.7958557128907, | |
| "completions/mean_terminated_length": 743.0538269042969, | |
| "completions/min_length": 479.0, | |
| "completions/min_terminated_length": 479.0, | |
| "entropy": 0.7617896695931753, | |
| "epoch": 0.01714383679067375, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 44785151.0, | |
| "reward": -10.150122165679932, | |
| "reward_std": 6.947455310821534, | |
| "rewards/ADERawReward/mean": -10.1985595703125, | |
| "rewards/ADERawReward/std": 6.9321370124816895, | |
| "rewards/StrictFormatReward/mean": 0.48437499925494193, | |
| "rewards/StrictFormatReward/std": 2.2780819296836854, | |
| "sampling/importance_sampling_ratio/max": 2.8471882343292236, | |
| "sampling/importance_sampling_ratio/mean": 0.4186240643262863, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.192489242553711, | |
| "sampling/sampling_logp_difference/mean": 0.025915385968983174, | |
| "step": 100, | |
| "step_time": 30.777787425796852 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.008854166837409139, | |
| "completions/max_length": 1016.1, | |
| "completions/max_terminated_length": 927.6, | |
| "completions/mean_length": 747.430224609375, | |
| "completions/mean_terminated_length": 744.9568664550782, | |
| "completions/min_length": 542.5, | |
| "completions/min_terminated_length": 542.5, | |
| "entropy": 0.7839122792085012, | |
| "epoch": 0.018858220469741126, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 49267497.0, | |
| "reward": -10.34236192703247, | |
| "reward_std": 6.522932863235473, | |
| "rewards/ADERawReward/mean": -10.377049255371094, | |
| "rewards/ADERawReward/std": 6.512392950057984, | |
| "rewards/StrictFormatReward/mean": 0.3468750029802322, | |
| "rewards/StrictFormatReward/std": 2.5984415769577027, | |
| "sampling/importance_sampling_ratio/max": 2.919663596153259, | |
| "sampling/importance_sampling_ratio/mean": 0.42760642170906066, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.582067346572876, | |
| "sampling/sampling_logp_difference/mean": 0.025711694732308388, | |
| "step": 110, | |
| "step_time": 31.13533423260669 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006250000139698386, | |
| "completions/max_length": 981.8, | |
| "completions/max_terminated_length": 905.6, | |
| "completions/mean_length": 741.3906433105469, | |
| "completions/mean_terminated_length": 739.6170837402344, | |
| "completions/min_length": 322.0, | |
| "completions/min_terminated_length": 322.0, | |
| "entropy": 0.7655617932478587, | |
| "epoch": 0.020572604148808505, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0003, | |
| "num_tokens": 53739095.0, | |
| "reward": -9.396285820007325, | |
| "reward_std": 6.019436550140381, | |
| "rewards/ADERawReward/mean": -9.43211908340454, | |
| "rewards/ADERawReward/std": 5.99921350479126, | |
| "rewards/StrictFormatReward/mean": 0.35833333134651185, | |
| "rewards/StrictFormatReward/std": 2.565337634086609, | |
| "sampling/importance_sampling_ratio/max": 2.7736236810684205, | |
| "sampling/importance_sampling_ratio/mean": 0.4329004347324371, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.103514838218689, | |
| "sampling/sampling_logp_difference/mean": 0.02581946086138487, | |
| "step": 120, | |
| "step_time": 30.675611695800036 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006250000139698386, | |
| "completions/max_length": 993.0, | |
| "completions/max_terminated_length": 917.3, | |
| "completions/mean_length": 743.2172119140625, | |
| "completions/mean_terminated_length": 741.4531127929688, | |
| "completions/min_length": 448.6, | |
| "completions/min_terminated_length": 448.6, | |
| "entropy": 0.7609198371569316, | |
| "epoch": 0.02228698782787588, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0186, | |
| "num_tokens": 58213992.0, | |
| "reward": -10.035307788848877, | |
| "reward_std": 7.6791582107543945, | |
| "rewards/ADERawReward/mean": -10.080307579040527, | |
| "rewards/ADERawReward/std": 7.661759281158448, | |
| "rewards/StrictFormatReward/mean": 0.45, | |
| "rewards/StrictFormatReward/std": 2.383046817779541, | |
| "sampling/importance_sampling_ratio/max": 2.7497065782547, | |
| "sampling/importance_sampling_ratio/mean": 0.38673609495162964, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.264748442173004, | |
| "sampling/sampling_logp_difference/mean": 0.02555535715073347, | |
| "step": 130, | |
| "step_time": 30.85125565490016 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.008333333535119891, | |
| "completions/max_length": 995.9, | |
| "completions/max_terminated_length": 936.1, | |
| "completions/mean_length": 747.1015808105469, | |
| "completions/mean_terminated_length": 744.7781860351563, | |
| "completions/min_length": 504.1, | |
| "completions/min_terminated_length": 504.1, | |
| "entropy": 0.7680625875790914, | |
| "epoch": 0.024001371506943255, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.014, | |
| "num_tokens": 62696459.0, | |
| "reward": -9.6462965965271, | |
| "reward_std": 6.933882856369019, | |
| "rewards/ADERawReward/mean": -9.687859153747558, | |
| "rewards/ADERawReward/std": 6.932486724853516, | |
| "rewards/StrictFormatReward/mean": 0.41562500298023225, | |
| "rewards/StrictFormatReward/std": 2.435987985134125, | |
| "sampling/importance_sampling_ratio/max": 2.7624801874160765, | |
| "sampling/importance_sampling_ratio/mean": 0.39736475646495817, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.469653701782226, | |
| "sampling/sampling_logp_difference/mean": 0.025803259573876858, | |
| "step": 140, | |
| "step_time": 30.751169363802184 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0067708335351198915, | |
| "completions/max_length": 996.8, | |
| "completions/max_terminated_length": 927.2, | |
| "completions/mean_length": 743.0797119140625, | |
| "completions/mean_terminated_length": 741.1633605957031, | |
| "completions/min_length": 364.4, | |
| "completions/min_terminated_length": 364.4, | |
| "entropy": 0.771180788675944, | |
| "epoch": 0.02571575518601063, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0135, | |
| "num_tokens": 67171268.0, | |
| "reward": -9.683474731445312, | |
| "reward_std": 6.344115209579468, | |
| "rewards/ADERawReward/mean": -9.721026802062989, | |
| "rewards/ADERawReward/std": 6.331903743743896, | |
| "rewards/StrictFormatReward/mean": 0.3755208313465118, | |
| "rewards/StrictFormatReward/std": 2.536852979660034, | |
| "sampling/importance_sampling_ratio/max": 2.6906844854354857, | |
| "sampling/importance_sampling_ratio/mean": 0.3939074516296387, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.591675233840943, | |
| "sampling/sampling_logp_difference/mean": 0.026031140238046646, | |
| "step": 150, | |
| "step_time": 30.759532438102177 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.009375000186264516, | |
| "completions/max_length": 1001.1, | |
| "completions/max_terminated_length": 928.5, | |
| "completions/mean_length": 743.9906616210938, | |
| "completions/mean_terminated_length": 741.3461547851563, | |
| "completions/min_length": 396.0, | |
| "completions/min_terminated_length": 396.0, | |
| "entropy": 0.772610588868459, | |
| "epoch": 0.027430138865078006, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.006, | |
| "num_tokens": 71648274.0, | |
| "reward": -9.378812217712403, | |
| "reward_std": 6.656644868850708, | |
| "rewards/ADERawReward/mean": -9.419801712036133, | |
| "rewards/ADERawReward/std": 6.638844442367554, | |
| "rewards/StrictFormatReward/mean": 0.4098958343267441, | |
| "rewards/StrictFormatReward/std": 2.457722854614258, | |
| "sampling/importance_sampling_ratio/max": 2.8564789295196533, | |
| "sampling/importance_sampling_ratio/mean": 0.3868180692195892, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.245336532592773, | |
| "sampling/sampling_logp_difference/mean": 0.026049494743347168, | |
| "step": 160, | |
| "step_time": 31.00511598830053 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00572916679084301, | |
| "completions/max_length": 990.5, | |
| "completions/max_terminated_length": 943.8, | |
| "completions/mean_length": 742.833349609375, | |
| "completions/mean_terminated_length": 741.2216247558594, | |
| "completions/min_length": 507.2, | |
| "completions/min_terminated_length": 507.2, | |
| "entropy": 0.7669690032800038, | |
| "epoch": 0.02914452254414538, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0046, | |
| "num_tokens": 76122162.0, | |
| "reward": -9.605478668212891, | |
| "reward_std": 5.803140306472779, | |
| "rewards/ADERawReward/mean": -9.645895195007324, | |
| "rewards/ADERawReward/std": 5.7928056716918945, | |
| "rewards/StrictFormatReward/mean": 0.4041666634380817, | |
| "rewards/StrictFormatReward/std": 2.4726577758789063, | |
| "sampling/importance_sampling_ratio/max": 2.71953125, | |
| "sampling/importance_sampling_ratio/mean": 0.4068588227033615, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.668109703063965, | |
| "sampling/sampling_logp_difference/mean": 0.025866417028009892, | |
| "step": 170, | |
| "step_time": 30.801190392104036 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0067708335351198915, | |
| "completions/max_length": 992.8, | |
| "completions/max_terminated_length": 902.9, | |
| "completions/mean_length": 745.7635559082031, | |
| "completions/mean_terminated_length": 743.8595642089844, | |
| "completions/min_length": 539.9, | |
| "completions/min_terminated_length": 539.9, | |
| "entropy": 0.7681126793225607, | |
| "epoch": 0.030858906223212756, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0009, | |
| "num_tokens": 80601532.0, | |
| "reward": -10.118904399871827, | |
| "reward_std": 6.3121555805206295, | |
| "rewards/ADERawReward/mean": -10.168487930297852, | |
| "rewards/ADERawReward/std": 6.31596007347107, | |
| "rewards/StrictFormatReward/mean": 0.49583332538604735, | |
| "rewards/StrictFormatReward/std": 2.2922281503677366, | |
| "sampling/importance_sampling_ratio/max": 2.808049750328064, | |
| "sampling/importance_sampling_ratio/mean": 0.40117439031600954, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.4242959260940555, | |
| "sampling/sampling_logp_difference/mean": 0.026261823065578938, | |
| "step": 180, | |
| "step_time": 30.95573297930241 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.009375000186264516, | |
| "completions/max_length": 1006.3, | |
| "completions/max_terminated_length": 940.1, | |
| "completions/mean_length": 745.2932495117187, | |
| "completions/mean_terminated_length": 742.6480163574219, | |
| "completions/min_length": 493.3, | |
| "completions/min_terminated_length": 493.3, | |
| "entropy": 0.7715065141518911, | |
| "epoch": 0.03257328990228013, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0167, | |
| "num_tokens": 85080239.0, | |
| "reward": -9.032559633255005, | |
| "reward_std": 5.6337450504302975, | |
| "rewards/ADERawReward/mean": -9.070111560821534, | |
| "rewards/ADERawReward/std": 5.615639925003052, | |
| "rewards/StrictFormatReward/mean": 0.3755208343267441, | |
| "rewards/StrictFormatReward/std": 2.532916522026062, | |
| "sampling/importance_sampling_ratio/max": 2.906028723716736, | |
| "sampling/importance_sampling_ratio/mean": 0.43047145903110506, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.560648036003113, | |
| "sampling/sampling_logp_difference/mean": 0.02589159458875656, | |
| "step": 190, | |
| "step_time": 31.04905029379588 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.004687500139698386, | |
| "completions/max_length": 967.5, | |
| "completions/max_terminated_length": 901.5, | |
| "completions/mean_length": 743.9588745117187, | |
| "completions/mean_terminated_length": 742.6530639648438, | |
| "completions/min_length": 482.7, | |
| "completions/min_terminated_length": 482.7, | |
| "entropy": 0.7573511083920796, | |
| "epoch": 0.0342876735813475, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0071, | |
| "num_tokens": 89554992.0, | |
| "reward": -9.594873905181885, | |
| "reward_std": 7.4608996391296385, | |
| "rewards/ADERawReward/mean": -9.645030212402343, | |
| "rewards/ADERawReward/std": 7.4621100425720215, | |
| "rewards/StrictFormatReward/mean": 0.5015625014901162, | |
| "rewards/StrictFormatReward/std": 2.2230212569236754, | |
| "sampling/importance_sampling_ratio/max": 2.762194800376892, | |
| "sampling/importance_sampling_ratio/mean": 0.3942416876554489, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.715522122383118, | |
| "sampling/sampling_logp_difference/mean": 0.02548809293657541, | |
| "step": 200, | |
| "step_time": 30.452962850301994 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.008854166883975268, | |
| "completions/max_length": 1002.2, | |
| "completions/max_terminated_length": 908.9, | |
| "completions/mean_length": 749.1671997070313, | |
| "completions/mean_terminated_length": 746.7061828613281, | |
| "completions/min_length": 479.3, | |
| "completions/min_terminated_length": 479.3, | |
| "entropy": 0.7769242246945699, | |
| "epoch": 0.03600205726041488, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0264, | |
| "num_tokens": 94041409.0, | |
| "reward": -9.6312180519104, | |
| "reward_std": 5.488570928573608, | |
| "rewards/ADERawReward/mean": -9.677363443374634, | |
| "rewards/ADERawReward/std": 5.482590246200561, | |
| "rewards/StrictFormatReward/mean": 0.46145834028720856, | |
| "rewards/StrictFormatReward/std": 2.369260883331299, | |
| "sampling/importance_sampling_ratio/max": 2.827503228187561, | |
| "sampling/importance_sampling_ratio/mean": 0.39889043271541597, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.615737795829773, | |
| "sampling/sampling_logp_difference/mean": 0.02642081268131733, | |
| "step": 210, | |
| "step_time": 31.002005700898007 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007291666883975268, | |
| "completions/max_length": 1011.2, | |
| "completions/max_terminated_length": 917.8, | |
| "completions/mean_length": 744.7901306152344, | |
| "completions/mean_terminated_length": 742.7363525390625, | |
| "completions/min_length": 326.2, | |
| "completions/min_terminated_length": 326.2, | |
| "entropy": 0.7611714641253153, | |
| "epoch": 0.03771644093948225, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0072, | |
| "num_tokens": 98520158.0, | |
| "reward": -9.730458450317382, | |
| "reward_std": 6.581094980239868, | |
| "rewards/ADERawReward/mean": -9.778323078155518, | |
| "rewards/ADERawReward/std": 6.570281600952148, | |
| "rewards/StrictFormatReward/mean": 0.478645833581686, | |
| "rewards/StrictFormatReward/std": 2.319968819618225, | |
| "sampling/importance_sampling_ratio/max": 2.9045305013656617, | |
| "sampling/importance_sampling_ratio/mean": 0.4044792056083679, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.901871252059936, | |
| "sampling/sampling_logp_difference/mean": 0.026320008002221583, | |
| "step": 220, | |
| "step_time": 31.057802741799968 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00833333358168602, | |
| "completions/max_length": 1006.6, | |
| "completions/max_terminated_length": 907.8, | |
| "completions/mean_length": 744.116162109375, | |
| "completions/mean_terminated_length": 741.7729553222656, | |
| "completions/min_length": 364.6, | |
| "completions/min_terminated_length": 364.6, | |
| "entropy": 0.7634056508541107, | |
| "epoch": 0.03943082461854963, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 102995469.0, | |
| "reward": -9.594787311553954, | |
| "reward_std": 6.4190247535705565, | |
| "rewards/ADERawReward/mean": -9.632339668273925, | |
| "rewards/ADERawReward/std": 6.41018123626709, | |
| "rewards/StrictFormatReward/mean": 0.37552083283662796, | |
| "rewards/StrictFormatReward/std": 2.4954795360565187, | |
| "sampling/importance_sampling_ratio/max": 2.7927833795547485, | |
| "sampling/importance_sampling_ratio/mean": 0.38037638664245604, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.303525161743164, | |
| "sampling/sampling_logp_difference/mean": 0.026079374738037585, | |
| "step": 230, | |
| "step_time": 30.925862592204066 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625000232830644, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 922.7, | |
| "completions/mean_length": 747.7349060058593, | |
| "completions/mean_terminated_length": 743.3495300292968, | |
| "completions/min_length": 425.3, | |
| "completions/min_terminated_length": 425.3, | |
| "entropy": 0.7614292343457539, | |
| "epoch": 0.04114520829761701, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "num_tokens": 107478240.0, | |
| "reward": -9.159432983398437, | |
| "reward_std": 5.858185243606568, | |
| "rewards/ADERawReward/mean": -9.192401695251466, | |
| "rewards/ADERawReward/std": 5.8467125415802, | |
| "rewards/StrictFormatReward/mean": 0.3296875016763806, | |
| "rewards/StrictFormatReward/std": 2.6086880922317506, | |
| "sampling/importance_sampling_ratio/max": 2.8600486755371093, | |
| "sampling/importance_sampling_ratio/mean": 0.38983087837696073, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.5773406505584715, | |
| "sampling/sampling_logp_difference/mean": 0.026277855411171912, | |
| "step": 240, | |
| "step_time": 31.37704919550306 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.011458333488553763, | |
| "completions/max_length": 1012.8, | |
| "completions/max_terminated_length": 905.7, | |
| "completions/mean_length": 743.3661682128907, | |
| "completions/mean_terminated_length": 740.1103088378907, | |
| "completions/min_length": 399.4, | |
| "completions/min_terminated_length": 399.4, | |
| "entropy": 0.753006378809611, | |
| "epoch": 0.04285959197668438, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0194, | |
| "num_tokens": 111952815.0, | |
| "reward": -9.348959684371948, | |
| "reward_std": 5.709445333480835, | |
| "rewards/ADERawReward/mean": -9.387657499313354, | |
| "rewards/ADERawReward/std": 5.683103704452515, | |
| "rewards/StrictFormatReward/mean": 0.38697916716337205, | |
| "rewards/StrictFormatReward/std": 2.5156437873840334, | |
| "sampling/importance_sampling_ratio/max": 2.7942948818206785, | |
| "sampling/importance_sampling_ratio/mean": 0.3898850232362747, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.9041135787963865, | |
| "sampling/sampling_logp_difference/mean": 0.026003769040107726, | |
| "step": 250, | |
| "step_time": 31.078463642198766 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.016145833814516664, | |
| "completions/max_length": 1017.2, | |
| "completions/max_terminated_length": 912.4, | |
| "completions/mean_length": 749.7567993164063, | |
| "completions/mean_terminated_length": 745.2748840332031, | |
| "completions/min_length": 587.3, | |
| "completions/min_terminated_length": 587.3, | |
| "entropy": 0.7615062018235524, | |
| "epoch": 0.04457397565575176, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0051, | |
| "num_tokens": 116440044.0, | |
| "reward": -9.92240343093872, | |
| "reward_std": 5.772085857391358, | |
| "rewards/ADERawReward/mean": -9.958809566497802, | |
| "rewards/ADERawReward/std": 5.756062030792236, | |
| "rewards/StrictFormatReward/mean": 0.36406250670552254, | |
| "rewards/StrictFormatReward/std": 2.54167058467865, | |
| "sampling/importance_sampling_ratio/max": 2.7225013256072996, | |
| "sampling/importance_sampling_ratio/mean": 0.3842493683099747, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.781629824638367, | |
| "sampling/sampling_logp_difference/mean": 0.02624006476253271, | |
| "step": 260, | |
| "step_time": 31.103271031705663 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.008333333535119891, | |
| "completions/max_length": 998.2, | |
| "completions/max_terminated_length": 911.1, | |
| "completions/mean_length": 743.7968994140625, | |
| "completions/mean_terminated_length": 741.4540466308594, | |
| "completions/min_length": 510.3, | |
| "completions/min_terminated_length": 510.3, | |
| "entropy": 0.7562108953793844, | |
| "epoch": 0.04628835933481913, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0247, | |
| "num_tokens": 120916390.0, | |
| "reward": -9.01723370552063, | |
| "reward_std": 5.40621428489685, | |
| "rewards/ADERawReward/mean": -9.065098094940186, | |
| "rewards/ADERawReward/std": 5.384741115570068, | |
| "rewards/StrictFormatReward/mean": 0.4786458298563957, | |
| "rewards/StrictFormatReward/std": 2.2954741954803466, | |
| "sampling/importance_sampling_ratio/max": 2.789908194541931, | |
| "sampling/importance_sampling_ratio/mean": 0.40749328434467313, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.034530830383301, | |
| "sampling/sampling_logp_difference/mean": 0.026305202022194864, | |
| "step": 270, | |
| "step_time": 30.820387327201026 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.009375000139698386, | |
| "completions/max_length": 1011.8, | |
| "completions/max_terminated_length": 914.4, | |
| "completions/mean_length": 745.3156433105469, | |
| "completions/mean_terminated_length": 742.68203125, | |
| "completions/min_length": 492.1, | |
| "completions/min_terminated_length": 492.1, | |
| "entropy": 0.757685911655426, | |
| "epoch": 0.04800274301388651, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "num_tokens": 125395556.0, | |
| "reward": -9.0065682888031, | |
| "reward_std": 5.3750749111175535, | |
| "rewards/ADERawReward/mean": -9.06875557899475, | |
| "rewards/ADERawReward/std": 5.365862655639648, | |
| "rewards/StrictFormatReward/mean": 0.6218749970197678, | |
| "rewards/StrictFormatReward/std": 1.9849600195884705, | |
| "sampling/importance_sampling_ratio/max": 2.791543960571289, | |
| "sampling/importance_sampling_ratio/mean": 0.36947061419487, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.032503914833069, | |
| "sampling/sampling_logp_difference/mean": 0.02653008736670017, | |
| "step": 280, | |
| "step_time": 31.023602066392776 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.009375000139698386, | |
| "completions/max_length": 1011.5, | |
| "completions/max_terminated_length": 906.5, | |
| "completions/mean_length": 745.1588745117188, | |
| "completions/mean_terminated_length": 742.5252624511719, | |
| "completions/min_length": 384.9, | |
| "completions/min_terminated_length": 384.9, | |
| "entropy": 0.7569859484831493, | |
| "epoch": 0.04971712669295388, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0025, | |
| "num_tokens": 129874581.0, | |
| "reward": -9.8903018951416, | |
| "reward_std": 5.79907751083374, | |
| "rewards/ADERawReward/mean": -9.936447715759277, | |
| "rewards/ADERawReward/std": 5.782765436172485, | |
| "rewards/StrictFormatReward/mean": 0.4614583358168602, | |
| "rewards/StrictFormatReward/std": 2.3571563005447387, | |
| "sampling/importance_sampling_ratio/max": 2.780865716934204, | |
| "sampling/importance_sampling_ratio/mean": 0.3752612203359604, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.010831832885742, | |
| "sampling/sampling_logp_difference/mean": 0.026486290618777275, | |
| "step": 290, | |
| "step_time": 31.036961890700333 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010937500186264515, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 909.3, | |
| "completions/mean_length": 743.6302307128906, | |
| "completions/mean_terminated_length": 740.5345153808594, | |
| "completions/min_length": 545.1, | |
| "completions/min_terminated_length": 545.1, | |
| "entropy": 0.7479812840620677, | |
| "epoch": 0.05143151037202126, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 134349103.0, | |
| "reward": -8.726549053192139, | |
| "reward_std": 5.281997394561768, | |
| "rewards/ADERawReward/mean": -8.780142164230346, | |
| "rewards/ADERawReward/std": 5.273246812820434, | |
| "rewards/StrictFormatReward/mean": 0.5359374985098839, | |
| "rewards/StrictFormatReward/std": 2.176564943790436, | |
| "sampling/importance_sampling_ratio/max": 2.9104627132415772, | |
| "sampling/importance_sampling_ratio/mean": 0.40789560675621034, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.580249309539795, | |
| "sampling/sampling_logp_difference/mean": 0.025899984315037726, | |
| "step": 300, | |
| "step_time": 31.32498294780089 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007812500186264514, | |
| "completions/max_length": 998.0, | |
| "completions/max_terminated_length": 917.8, | |
| "completions/mean_length": 745.6083557128907, | |
| "completions/mean_terminated_length": 743.4208374023438, | |
| "completions/min_length": 550.1, | |
| "completions/min_terminated_length": 550.1, | |
| "entropy": 0.7465774556001027, | |
| "epoch": 0.05314589405108863, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0123, | |
| "num_tokens": 138828591.0, | |
| "reward": -9.077164840698241, | |
| "reward_std": 5.992494773864746, | |
| "rewards/ADERawReward/mean": -9.131331491470338, | |
| "rewards/ADERawReward/std": 5.984670972824096, | |
| "rewards/StrictFormatReward/mean": 0.5416666656732559, | |
| "rewards/StrictFormatReward/std": 2.169144892692566, | |
| "sampling/importance_sampling_ratio/max": 2.749341917037964, | |
| "sampling/importance_sampling_ratio/mean": 0.3683441460132599, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.0134562969207765, | |
| "sampling/sampling_logp_difference/mean": 0.026201115176081657, | |
| "step": 310, | |
| "step_time": 30.797605072900478 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.016145833814516664, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 899.7, | |
| "completions/mean_length": 747.6823181152344, | |
| "completions/mean_terminated_length": 743.1548828125, | |
| "completions/min_length": 458.0, | |
| "completions/min_terminated_length": 458.0, | |
| "entropy": 0.7556955059369405, | |
| "epoch": 0.05486027773015601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.006, | |
| "num_tokens": 143312285.0, | |
| "reward": -9.226322269439697, | |
| "reward_std": 6.171311950683593, | |
| "rewards/ADERawReward/mean": -9.265593338012696, | |
| "rewards/ADERawReward/std": 6.160300636291504, | |
| "rewards/StrictFormatReward/mean": 0.39270834550261496, | |
| "rewards/StrictFormatReward/std": 2.4976075172424315, | |
| "sampling/importance_sampling_ratio/max": 2.7495847940444946, | |
| "sampling/importance_sampling_ratio/mean": 0.384751632809639, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.004838466644287, | |
| "sampling/sampling_logp_difference/mean": 0.026612747460603714, | |
| "step": 320, | |
| "step_time": 31.281772803403147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007812500093132257, | |
| "completions/max_length": 1001.5, | |
| "completions/max_terminated_length": 901.0, | |
| "completions/mean_length": 745.114599609375, | |
| "completions/mean_terminated_length": 742.9226379394531, | |
| "completions/min_length": 415.5, | |
| "completions/min_terminated_length": 415.5, | |
| "entropy": 0.7487649619579315, | |
| "epoch": 0.05657466140922338, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0105, | |
| "num_tokens": 147791241.0, | |
| "reward": -8.852140951156617, | |
| "reward_std": 6.015402030944824, | |
| "rewards/ADERawReward/mean": -8.904588794708252, | |
| "rewards/ADERawReward/std": 6.00461745262146, | |
| "rewards/StrictFormatReward/mean": 0.5244791731238365, | |
| "rewards/StrictFormatReward/std": 2.198140048980713, | |
| "sampling/importance_sampling_ratio/max": 2.8983518362045286, | |
| "sampling/importance_sampling_ratio/mean": 0.38104148209095, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.405174565315247, | |
| "sampling/sampling_logp_difference/mean": 0.026203346997499467, | |
| "step": 330, | |
| "step_time": 30.943252457803464 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.013541666883975267, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 910.4, | |
| "completions/mean_length": 746.2218994140625, | |
| "completions/mean_terminated_length": 742.4164428710938, | |
| "completions/min_length": 451.8, | |
| "completions/min_terminated_length": 451.8, | |
| "entropy": 0.7482379992802938, | |
| "epoch": 0.05828904508829076, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0044, | |
| "num_tokens": 152271555.0, | |
| "reward": -9.297206592559814, | |
| "reward_std": 5.522917318344116, | |
| "rewards/ADERawReward/mean": -9.338768911361694, | |
| "rewards/ADERawReward/std": 5.507352113723755, | |
| "rewards/StrictFormatReward/mean": 0.4156250014901161, | |
| "rewards/StrictFormatReward/std": 2.437586045265198, | |
| "sampling/importance_sampling_ratio/max": 2.782149076461792, | |
| "sampling/importance_sampling_ratio/mean": 0.37727462947368623, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.424329519271851, | |
| "sampling/sampling_logp_difference/mean": 0.0264679953455925, | |
| "step": 340, | |
| "step_time": 31.394037494297663 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.012500000232830644, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 929.2, | |
| "completions/mean_length": 747.1989868164062, | |
| "completions/mean_terminated_length": 743.6892578125, | |
| "completions/min_length": 461.7, | |
| "completions/min_terminated_length": 461.7, | |
| "entropy": 0.7526971201101939, | |
| "epoch": 0.06000342876735813, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0279, | |
| "num_tokens": 156753761.0, | |
| "reward": -8.87293882369995, | |
| "reward_std": 5.407143402099609, | |
| "rewards/ADERawReward/mean": -8.912782192230225, | |
| "rewards/ADERawReward/std": 5.394438552856445, | |
| "rewards/StrictFormatReward/mean": 0.3984375029802322, | |
| "rewards/StrictFormatReward/std": 2.4990755558013915, | |
| "sampling/importance_sampling_ratio/max": 2.7867547035217286, | |
| "sampling/importance_sampling_ratio/mean": 0.3730757534503937, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.122068333625793, | |
| "sampling/sampling_logp_difference/mean": 0.02654006313532591, | |
| "step": 350, | |
| "step_time": 31.515721038298217 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.012500000325962901, | |
| "completions/max_length": 1007.6, | |
| "completions/max_terminated_length": 894.1, | |
| "completions/mean_length": 747.2932495117187, | |
| "completions/mean_terminated_length": 743.7852661132813, | |
| "completions/min_length": 591.8, | |
| "completions/min_terminated_length": 591.8, | |
| "entropy": 0.7486960550149282, | |
| "epoch": 0.06171781244642551, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0224, | |
| "num_tokens": 161235908.0, | |
| "reward": -9.346630477905274, | |
| "reward_std": 5.943923234939575, | |
| "rewards/ADERawReward/mean": -9.397359657287598, | |
| "rewards/ADERawReward/std": 5.933114051818848, | |
| "rewards/StrictFormatReward/mean": 0.5072916746139526, | |
| "rewards/StrictFormatReward/std": 2.258626651763916, | |
| "sampling/importance_sampling_ratio/max": 2.783932328224182, | |
| "sampling/importance_sampling_ratio/mean": 0.3899604082107544, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.8976672172546385, | |
| "sampling/sampling_logp_difference/mean": 0.02663377095013857, | |
| "step": 360, | |
| "step_time": 31.072972166599357 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010937500232830643, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 907.6, | |
| "completions/mean_length": 743.9771057128906, | |
| "completions/mean_terminated_length": 740.8768371582031, | |
| "completions/min_length": 373.6, | |
| "completions/min_terminated_length": 373.6, | |
| "entropy": 0.7463085273901622, | |
| "epoch": 0.06343219612549289, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0195, | |
| "num_tokens": 165711768.0, | |
| "reward": -9.58823890686035, | |
| "reward_std": 7.584380769729615, | |
| "rewards/ADERawReward/mean": -9.62750973701477, | |
| "rewards/ADERawReward/std": 7.566325521469116, | |
| "rewards/StrictFormatReward/mean": 0.3927083402872086, | |
| "rewards/StrictFormatReward/std": 2.502521538734436, | |
| "sampling/importance_sampling_ratio/max": 2.824156880378723, | |
| "sampling/importance_sampling_ratio/mean": 0.38374905586242675, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.386010980606079, | |
| "sampling/sampling_logp_difference/mean": 0.02665374930948019, | |
| "step": 370, | |
| "step_time": 31.397882584496983 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.009895833674818277, | |
| "completions/max_length": 1000.8, | |
| "completions/max_terminated_length": 913.6, | |
| "completions/mean_length": 746.1416870117188, | |
| "completions/mean_terminated_length": 743.3394592285156, | |
| "completions/min_length": 440.4, | |
| "completions/min_terminated_length": 440.4, | |
| "entropy": 0.7471979439258576, | |
| "epoch": 0.06514657980456026, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "num_tokens": 170192072.0, | |
| "reward": -9.293725442886352, | |
| "reward_std": 5.743723678588867, | |
| "rewards/ADERawReward/mean": -9.341016817092896, | |
| "rewards/ADERawReward/std": 5.729398393630982, | |
| "rewards/StrictFormatReward/mean": 0.4729166701436043, | |
| "rewards/StrictFormatReward/std": 2.320163094997406, | |
| "sampling/importance_sampling_ratio/max": 2.869658589363098, | |
| "sampling/importance_sampling_ratio/mean": 0.3773952782154083, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.138100743293762, | |
| "sampling/sampling_logp_difference/mean": 0.026742291823029517, | |
| "step": 380, | |
| "step_time": 30.96988229679846 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.012500000325962901, | |
| "completions/max_length": 1016.4, | |
| "completions/max_terminated_length": 914.7, | |
| "completions/mean_length": 748.4885559082031, | |
| "completions/mean_terminated_length": 744.993017578125, | |
| "completions/min_length": 477.5, | |
| "completions/min_terminated_length": 477.5, | |
| "entropy": 0.7559548437595367, | |
| "epoch": 0.06686096348362763, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0181, | |
| "num_tokens": 174677202.0, | |
| "reward": -9.28088207244873, | |
| "reward_std": 5.592328834533691, | |
| "rewards/ADERawReward/mean": -9.324163150787353, | |
| "rewards/ADERawReward/std": 5.577477645874024, | |
| "rewards/StrictFormatReward/mean": 0.43281249795109034, | |
| "rewards/StrictFormatReward/std": 2.4112179040908814, | |
| "sampling/importance_sampling_ratio/max": 2.825889301300049, | |
| "sampling/importance_sampling_ratio/mean": 0.3906464457511902, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.902947115898132, | |
| "sampling/sampling_logp_difference/mean": 0.026762423664331438, | |
| "step": 390, | |
| "step_time": 31.20840191419411 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007812500186264514, | |
| "completions/max_length": 992.4, | |
| "completions/max_terminated_length": 934.3, | |
| "completions/mean_length": 745.6031433105469, | |
| "completions/mean_terminated_length": 743.4128295898438, | |
| "completions/min_length": 437.2, | |
| "completions/min_terminated_length": 437.2, | |
| "entropy": 0.7531329174836476, | |
| "epoch": 0.068575347162695, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.008, | |
| "num_tokens": 179156696.0, | |
| "reward": -9.488264322280884, | |
| "reward_std": 5.980498790740967, | |
| "rewards/ADERawReward/mean": -9.5326913356781, | |
| "rewards/ADERawReward/std": 5.96862416267395, | |
| "rewards/StrictFormatReward/mean": 0.44427084028720853, | |
| "rewards/StrictFormatReward/std": 2.4021025896072388, | |
| "sampling/importance_sampling_ratio/max": 2.796902632713318, | |
| "sampling/importance_sampling_ratio/mean": 0.38265294432640073, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.037104201316834, | |
| "sampling/sampling_logp_difference/mean": 0.026842619478702544, | |
| "step": 400, | |
| "step_time": 30.975298491402647 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.008854166977107525, | |
| "completions/max_length": 1020.0, | |
| "completions/max_terminated_length": 935.6, | |
| "completions/mean_length": 746.836474609375, | |
| "completions/mean_terminated_length": 744.3670227050782, | |
| "completions/min_length": 503.7, | |
| "completions/min_terminated_length": 503.7, | |
| "entropy": 0.7571933090686798, | |
| "epoch": 0.07028973084176239, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0061, | |
| "num_tokens": 183639214.0, | |
| "reward": -8.974328899383545, | |
| "reward_std": 6.1143780708312985, | |
| "rewards/ADERawReward/mean": -9.019901895523072, | |
| "rewards/ADERawReward/std": 6.102041292190552, | |
| "rewards/StrictFormatReward/mean": 0.45572916865348817, | |
| "rewards/StrictFormatReward/std": 2.3774968147277833, | |
| "sampling/importance_sampling_ratio/max": 2.655148983001709, | |
| "sampling/importance_sampling_ratio/mean": 0.36126451194286346, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.974502754211426, | |
| "sampling/sampling_logp_difference/mean": 0.027187051996588708, | |
| "step": 410, | |
| "step_time": 31.259745221505 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010416666930541395, | |
| "completions/max_length": 1017.3, | |
| "completions/max_terminated_length": 918.2, | |
| "completions/mean_length": 747.1531433105469, | |
| "completions/mean_terminated_length": 744.2319213867188, | |
| "completions/min_length": 565.9, | |
| "completions/min_terminated_length": 565.9, | |
| "entropy": 0.7566624025503794, | |
| "epoch": 0.07200411452082976, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0087, | |
| "num_tokens": 188121876.0, | |
| "reward": -8.923426342010497, | |
| "reward_std": 6.084955930709839, | |
| "rewards/ADERawReward/mean": -8.9724365234375, | |
| "rewards/ADERawReward/std": 6.07550630569458, | |
| "rewards/StrictFormatReward/mean": 0.4901041708886623, | |
| "rewards/StrictFormatReward/std": 2.2838862776756286, | |
| "sampling/importance_sampling_ratio/max": 2.812648606300354, | |
| "sampling/importance_sampling_ratio/mean": 0.3750308662652969, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.534145450592041, | |
| "sampling/sampling_logp_difference/mean": 0.026898518949747086, | |
| "step": 420, | |
| "step_time": 31.253269567198004 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010416666837409138, | |
| "completions/max_length": 1000.9, | |
| "completions/max_terminated_length": 899.6, | |
| "completions/mean_length": 744.9640930175781, | |
| "completions/mean_terminated_length": 742.0363525390625, | |
| "completions/min_length": 414.5, | |
| "completions/min_terminated_length": 414.5, | |
| "entropy": 0.7489292581876119, | |
| "epoch": 0.07371849819989713, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0073, | |
| "num_tokens": 192599695.0, | |
| "reward": -9.504335880279541, | |
| "reward_std": 5.772246503829956, | |
| "rewards/ADERawReward/mean": -9.548762798309326, | |
| "rewards/ADERawReward/std": 5.758863925933838, | |
| "rewards/StrictFormatReward/mean": 0.4442708283662796, | |
| "rewards/StrictFormatReward/std": 2.37346408367157, | |
| "sampling/importance_sampling_ratio/max": 2.820726418495178, | |
| "sampling/importance_sampling_ratio/mean": 0.36917952001094817, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.74177827835083, | |
| "sampling/sampling_logp_difference/mean": 0.026925336755812167, | |
| "step": 430, | |
| "step_time": 31.05574051309668 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.013541666977107524, | |
| "completions/max_length": 1003.0, | |
| "completions/max_terminated_length": 914.5, | |
| "completions/mean_length": 746.8135620117188, | |
| "completions/mean_terminated_length": 743.0150817871094, | |
| "completions/min_length": 349.1, | |
| "completions/min_terminated_length": 349.1, | |
| "entropy": 0.7525838236014049, | |
| "epoch": 0.0754328818789645, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0216, | |
| "num_tokens": 197081145.0, | |
| "reward": -8.942778491973877, | |
| "reward_std": 5.134439134597779, | |
| "rewards/ADERawReward/mean": -8.988924121856689, | |
| "rewards/ADERawReward/std": 5.119956731796265, | |
| "rewards/StrictFormatReward/mean": 0.4614583387970924, | |
| "rewards/StrictFormatReward/std": 2.3441874384880066, | |
| "sampling/importance_sampling_ratio/max": 2.7575156688690186, | |
| "sampling/importance_sampling_ratio/mean": 0.3831501841545105, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 6.117226243019104, | |
| "sampling/sampling_logp_difference/mean": 0.02673897407948971, | |
| "step": 440, | |
| "step_time": 31.136345591007558 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00989583358168602, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 962.1, | |
| "completions/mean_length": 746.3265747070312, | |
| "completions/mean_terminated_length": 743.5626037597656, | |
| "completions/min_length": 502.5, | |
| "completions/min_terminated_length": 502.5, | |
| "entropy": 0.7494863112767537, | |
| "epoch": 0.07714726555803189, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "num_tokens": 201562060.0, | |
| "reward": -8.833820581436157, | |
| "reward_std": 6.585151100158692, | |
| "rewards/ADERawReward/mean": -8.883976650238036, | |
| "rewards/ADERawReward/std": 6.576332855224609, | |
| "rewards/StrictFormatReward/mean": 0.5015625029802322, | |
| "rewards/StrictFormatReward/std": 2.246139848232269, | |
| "sampling/importance_sampling_ratio/max": 2.858155703544617, | |
| "sampling/importance_sampling_ratio/mean": 0.38457230627536776, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.555598568916321, | |
| "sampling/sampling_logp_difference/mean": 0.026770622283220292, | |
| "step": 450, | |
| "step_time": 31.314683586404136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010937500325962902, | |
| "completions/max_length": 1012.7, | |
| "completions/max_terminated_length": 908.7, | |
| "completions/mean_length": 747.1239807128907, | |
| "completions/mean_terminated_length": 744.033935546875, | |
| "completions/min_length": 452.2, | |
| "completions/min_terminated_length": 452.2, | |
| "entropy": 0.7497527619202932, | |
| "epoch": 0.07886164923709926, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0312, | |
| "num_tokens": 206044186.0, | |
| "reward": -8.526823663711548, | |
| "reward_std": 4.98410964012146, | |
| "rewards/ADERawReward/mean": -8.568385744094849, | |
| "rewards/ADERawReward/std": 4.961003923416138, | |
| "rewards/StrictFormatReward/mean": 0.415625, | |
| "rewards/StrictFormatReward/std": 2.4348717212677, | |
| "sampling/importance_sampling_ratio/max": 2.7420751094818114, | |
| "sampling/importance_sampling_ratio/mean": 0.3720589101314545, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.188339471817017, | |
| "sampling/sampling_logp_difference/mean": 0.026370251737535, | |
| "step": 460, | |
| "step_time": 31.211517236700455 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010416666837409138, | |
| "completions/max_length": 1013.2, | |
| "completions/max_terminated_length": 913.8, | |
| "completions/mean_length": 747.784912109375, | |
| "completions/mean_terminated_length": 744.8851257324219, | |
| "completions/min_length": 416.8, | |
| "completions/min_terminated_length": 416.8, | |
| "entropy": 0.7524498959382375, | |
| "epoch": 0.08057603291616663, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0088, | |
| "num_tokens": 210527581.0, | |
| "reward": -9.329790830612183, | |
| "reward_std": 5.533093023300171, | |
| "rewards/ADERawReward/mean": -9.378228092193604, | |
| "rewards/ADERawReward/std": 5.5270590543746945, | |
| "rewards/StrictFormatReward/mean": 0.4843750029802322, | |
| "rewards/StrictFormatReward/std": 2.30391206741333, | |
| "sampling/importance_sampling_ratio/max": 2.7844461679458616, | |
| "sampling/importance_sampling_ratio/mean": 0.36996753215789796, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.251289129257202, | |
| "sampling/sampling_logp_difference/mean": 0.026920023374259472, | |
| "step": 470, | |
| "step_time": 31.291990611601797 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.008333333535119891, | |
| "completions/max_length": 1006.7, | |
| "completions/max_terminated_length": 924.5, | |
| "completions/mean_length": 744.3693054199218, | |
| "completions/mean_terminated_length": 742.0288696289062, | |
| "completions/min_length": 345.2, | |
| "completions/min_terminated_length": 345.2, | |
| "entropy": 0.7420653978983561, | |
| "epoch": 0.08229041659523402, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0135, | |
| "num_tokens": 215005026.0, | |
| "reward": -8.811880826950073, | |
| "reward_std": 5.741937494277954, | |
| "rewards/ADERawReward/mean": -8.863182735443115, | |
| "rewards/ADERawReward/std": 5.721714019775391, | |
| "rewards/StrictFormatReward/mean": 0.5130208387970925, | |
| "rewards/StrictFormatReward/std": 2.2228100419044496, | |
| "sampling/importance_sampling_ratio/max": 2.887178134918213, | |
| "sampling/importance_sampling_ratio/mean": 0.3799948424100876, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.423047304153442, | |
| "sampling/sampling_logp_difference/mean": 0.026753966324031354, | |
| "step": 480, | |
| "step_time": 31.235763939598108 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007812500232830644, | |
| "completions/max_length": 1007.4, | |
| "completions/max_terminated_length": 921.0, | |
| "completions/mean_length": 745.5041809082031, | |
| "completions/mean_terminated_length": 743.31552734375, | |
| "completions/min_length": 503.1, | |
| "completions/min_terminated_length": 503.1, | |
| "entropy": 0.7548379202683767, | |
| "epoch": 0.08400480027430139, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 219484362.0, | |
| "reward": -8.726992988586426, | |
| "reward_std": 5.2785539627075195, | |
| "rewards/ADERawReward/mean": -8.783451509475707, | |
| "rewards/ADERawReward/std": 5.263514995574951, | |
| "rewards/StrictFormatReward/mean": 0.5645833373069763, | |
| "rewards/StrictFormatReward/std": 2.121866428852081, | |
| "sampling/importance_sampling_ratio/max": 2.8357564210891724, | |
| "sampling/importance_sampling_ratio/mean": 0.36869050562381744, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.648310852050781, | |
| "sampling/sampling_logp_difference/mean": 0.026755044236779213, | |
| "step": 490, | |
| "step_time": 31.2714247025011 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010416666930541395, | |
| "completions/max_length": 1011.9, | |
| "completions/max_terminated_length": 899.3, | |
| "completions/mean_length": 745.266162109375, | |
| "completions/mean_terminated_length": 742.3389892578125, | |
| "completions/min_length": 504.1, | |
| "completions/min_terminated_length": 504.1, | |
| "entropy": 0.7439370552698771, | |
| "epoch": 0.08571918395336876, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0111, | |
| "num_tokens": 223962569.0, | |
| "reward": -8.893217849731446, | |
| "reward_std": 6.001188564300537, | |
| "rewards/ADERawReward/mean": -8.93993616104126, | |
| "rewards/ADERawReward/std": 5.9909703731536865, | |
| "rewards/StrictFormatReward/mean": 0.46718750298023226, | |
| "rewards/StrictFormatReward/std": 2.34447557926178, | |
| "sampling/importance_sampling_ratio/max": 2.847609305381775, | |
| "sampling/importance_sampling_ratio/mean": 0.4031797587871552, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.063726282119751, | |
| "sampling/sampling_logp_difference/mean": 0.02628465835005045, | |
| "step": 500, | |
| "step_time": 31.151053833100015 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.008854166837409139, | |
| "completions/max_length": 1009.8, | |
| "completions/max_terminated_length": 904.5, | |
| "completions/mean_length": 745.569287109375, | |
| "completions/mean_terminated_length": 743.0883544921875, | |
| "completions/min_length": 499.8, | |
| "completions/min_terminated_length": 499.8, | |
| "entropy": 0.7393844624360403, | |
| "epoch": 0.08743356763243613, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.003, | |
| "num_tokens": 228442430.0, | |
| "reward": -8.746051263809203, | |
| "reward_std": 5.200268793106079, | |
| "rewards/ADERawReward/mean": -8.796780586242676, | |
| "rewards/ADERawReward/std": 5.189048337936401, | |
| "rewards/StrictFormatReward/mean": 0.5072916716337204, | |
| "rewards/StrictFormatReward/std": 2.2304036617279053, | |
| "sampling/importance_sampling_ratio/max": 2.8343311309814454, | |
| "sampling/importance_sampling_ratio/mean": 0.3753476530313492, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.841508960723877, | |
| "sampling/sampling_logp_difference/mean": 0.026619693823158742, | |
| "step": 510, | |
| "step_time": 31.27263153729873 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007291666837409139, | |
| "completions/max_length": 987.4, | |
| "completions/max_terminated_length": 894.7, | |
| "completions/mean_length": 743.181787109375, | |
| "completions/mean_terminated_length": 741.12255859375, | |
| "completions/min_length": 500.0, | |
| "completions/min_terminated_length": 500.0, | |
| "entropy": 0.7444122533003489, | |
| "epoch": 0.08914795131150352, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 232916635.0, | |
| "reward": -8.511340045928955, | |
| "reward_std": 4.909872698783874, | |
| "rewards/ADERawReward/mean": -8.566652154922485, | |
| "rewards/ADERawReward/std": 4.893595862388611, | |
| "rewards/StrictFormatReward/mean": 0.5531250022351741, | |
| "rewards/StrictFormatReward/std": 2.0853070855140685, | |
| "sampling/importance_sampling_ratio/max": 2.862236833572388, | |
| "sampling/importance_sampling_ratio/mean": 0.38686116933822634, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.322235846519471, | |
| "sampling/sampling_logp_difference/mean": 0.026448269747197627, | |
| "step": 520, | |
| "step_time": 30.78332566640311 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00677083358168602, | |
| "completions/max_length": 991.6, | |
| "completions/max_terminated_length": 919.4, | |
| "completions/mean_length": 747.3963806152344, | |
| "completions/mean_terminated_length": 745.5401245117188, | |
| "completions/min_length": 380.7, | |
| "completions/min_terminated_length": 380.7, | |
| "entropy": 0.7540235598882039, | |
| "epoch": 0.09086233499057089, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0113, | |
| "num_tokens": 237399348.0, | |
| "reward": -8.865944004058838, | |
| "reward_std": 5.564558506011963, | |
| "rewards/ADERawReward/mean": -8.92240219116211, | |
| "rewards/ADERawReward/std": 5.551964378356933, | |
| "rewards/StrictFormatReward/mean": 0.5645833313465118, | |
| "rewards/StrictFormatReward/std": 2.112958538532257, | |
| "sampling/importance_sampling_ratio/max": 2.7960949182510375, | |
| "sampling/importance_sampling_ratio/mean": 0.36649490892887115, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.830978512763977, | |
| "sampling/sampling_logp_difference/mean": 0.02695600502192974, | |
| "step": 530, | |
| "step_time": 30.918909230499413 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006250000186264515, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 911.9, | |
| "completions/mean_length": 744.5010681152344, | |
| "completions/mean_terminated_length": 742.7408508300781, | |
| "completions/min_length": 418.2, | |
| "completions/min_terminated_length": 418.2, | |
| "entropy": 0.7437257766723633, | |
| "epoch": 0.09257671866963826, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0034, | |
| "num_tokens": 241877222.0, | |
| "reward": -8.705044603347778, | |
| "reward_std": 4.9693234920501705, | |
| "rewards/ADERawReward/mean": -8.75405502319336, | |
| "rewards/ADERawReward/std": 4.961550354957581, | |
| "rewards/StrictFormatReward/mean": 0.490104167163372, | |
| "rewards/StrictFormatReward/std": 2.2909578323364257, | |
| "sampling/importance_sampling_ratio/max": 2.806812071800232, | |
| "sampling/importance_sampling_ratio/mean": 0.38454858362674715, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.238021278381348, | |
| "sampling/sampling_logp_difference/mean": 0.026717523485422133, | |
| "step": 540, | |
| "step_time": 31.30855839920405 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.01250000037252903, | |
| "completions/max_length": 1024.0, | |
| "completions/max_terminated_length": 931.3, | |
| "completions/mean_length": 749.8541870117188, | |
| "completions/mean_terminated_length": 746.3853088378906, | |
| "completions/min_length": 535.8, | |
| "completions/min_terminated_length": 535.8, | |
| "entropy": 0.7400518854459127, | |
| "epoch": 0.09429110234870564, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "num_tokens": 246365646.0, | |
| "reward": -8.35037841796875, | |
| "reward_std": 5.094221115112305, | |
| "rewards/ADERawReward/mean": -8.400534439086915, | |
| "rewards/ADERawReward/std": 5.078604912757873, | |
| "rewards/StrictFormatReward/mean": 0.5015625029802322, | |
| "rewards/StrictFormatReward/std": 2.2548654079437256, | |
| "sampling/importance_sampling_ratio/max": 2.729209136962891, | |
| "sampling/importance_sampling_ratio/mean": 0.3943050533533096, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.494288897514343, | |
| "sampling/sampling_logp_difference/mean": 0.026609367132186888, | |
| "step": 550, | |
| "step_time": 31.5098136007 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.011979166930541397, | |
| "completions/max_length": 1011.1, | |
| "completions/max_terminated_length": 901.4, | |
| "completions/mean_length": 746.1073059082031, | |
| "completions/mean_terminated_length": 742.7442687988281, | |
| "completions/min_length": 385.7, | |
| "completions/min_terminated_length": 385.7, | |
| "entropy": 0.7511671622594197, | |
| "epoch": 0.09600548602777302, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0039, | |
| "num_tokens": 250846300.0, | |
| "reward": -8.354753160476685, | |
| "reward_std": 4.998448300361633, | |
| "rewards/ADERawReward/mean": -8.403190279006958, | |
| "rewards/ADERawReward/std": 4.983683443069458, | |
| "rewards/StrictFormatReward/mean": 0.484375, | |
| "rewards/StrictFormatReward/std": 2.294073450565338, | |
| "sampling/importance_sampling_ratio/max": 2.7630903482437135, | |
| "sampling/importance_sampling_ratio/mean": 0.371540492773056, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.54563274383545, | |
| "sampling/sampling_logp_difference/mean": 0.026747452840209008, | |
| "step": 560, | |
| "step_time": 31.339443079804187 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006770833395421505, | |
| "completions/max_length": 1006.3, | |
| "completions/max_terminated_length": 919.6, | |
| "completions/mean_length": 746.6364807128906, | |
| "completions/mean_terminated_length": 744.7229919433594, | |
| "completions/min_length": 485.7, | |
| "completions/min_terminated_length": 485.7, | |
| "entropy": 0.7559505820274353, | |
| "epoch": 0.09771986970684039, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.022, | |
| "num_tokens": 255327154.0, | |
| "reward": -9.671938514709472, | |
| "reward_std": 6.359064817428589, | |
| "rewards/ADERawReward/mean": -9.72152156829834, | |
| "rewards/ADERawReward/std": 6.352941226959229, | |
| "rewards/StrictFormatReward/mean": 0.49583333283662795, | |
| "rewards/StrictFormatReward/std": 2.2696222186088564, | |
| "sampling/importance_sampling_ratio/max": 2.8152915716171263, | |
| "sampling/importance_sampling_ratio/mean": 0.37120532989501953, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.6147093534469605, | |
| "sampling/sampling_logp_difference/mean": 0.027104974165558814, | |
| "step": 570, | |
| "step_time": 31.2074279195047 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.009375000093132257, | |
| "completions/max_length": 989.5, | |
| "completions/max_terminated_length": 893.7, | |
| "completions/mean_length": 747.6479309082031, | |
| "completions/mean_terminated_length": 745.0316833496094, | |
| "completions/min_length": 484.2, | |
| "completions/min_terminated_length": 484.2, | |
| "entropy": 0.7506202618281047, | |
| "epoch": 0.09943425338590776, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 259810670.0, | |
| "reward": -8.481088829040527, | |
| "reward_std": 5.200103354454041, | |
| "rewards/ADERawReward/mean": -8.532390689849853, | |
| "rewards/ADERawReward/std": 5.18415892124176, | |
| "rewards/StrictFormatReward/mean": 0.5130208402872085, | |
| "rewards/StrictFormatReward/std": 2.2208899974823, | |
| "sampling/importance_sampling_ratio/max": 2.803302502632141, | |
| "sampling/importance_sampling_ratio/mean": 0.4020031362771988, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 5.206298041343689, | |
| "sampling/sampling_logp_difference/mean": 0.02699448149651289, | |
| "step": 580, | |
| "step_time": 31.03782037871715 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007812500093132257, | |
| "completions/max_length": 998.4, | |
| "completions/max_terminated_length": 910.7, | |
| "completions/mean_length": 747.5250183105469, | |
| "completions/mean_terminated_length": 745.33720703125, | |
| "completions/min_length": 399.1, | |
| "completions/min_terminated_length": 399.1, | |
| "entropy": 0.7567769944667816, | |
| "epoch": 0.10114863706497514, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0121, | |
| "num_tokens": 264294318.0, | |
| "reward": -8.613714408874511, | |
| "reward_std": 5.104084444046021, | |
| "rewards/ADERawReward/mean": -8.672464275360108, | |
| "rewards/ADERawReward/std": 5.086045837402343, | |
| "rewards/StrictFormatReward/mean": 0.5874999970197677, | |
| "rewards/StrictFormatReward/std": 2.0732890486717226, | |
| "sampling/importance_sampling_ratio/max": 2.7467212677001953, | |
| "sampling/importance_sampling_ratio/mean": 0.36461472511291504, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.379195928573608, | |
| "sampling/sampling_logp_difference/mean": 0.027049308083951472, | |
| "step": 590, | |
| "step_time": 31.140472374291857 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0062500000931322575, | |
| "completions/max_length": 992.1, | |
| "completions/max_terminated_length": 916.2, | |
| "completions/mean_length": 745.9307434082032, | |
| "completions/mean_terminated_length": 744.1726928710938, | |
| "completions/min_length": 395.7, | |
| "completions/min_terminated_length": 395.7, | |
| "entropy": 0.7494464596112569, | |
| "epoch": 0.10286302074404252, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 268774665.0, | |
| "reward": -8.773472213745118, | |
| "reward_std": 5.6750462532043455, | |
| "rewards/ADERawReward/mean": -8.832221984863281, | |
| "rewards/ADERawReward/std": 5.654839897155762, | |
| "rewards/StrictFormatReward/mean": 0.5875000059604645, | |
| "rewards/StrictFormatReward/std": 2.0246779561042785, | |
| "sampling/importance_sampling_ratio/max": 2.8275955438613893, | |
| "sampling/importance_sampling_ratio/mean": 0.3867086052894592, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.084816575050354, | |
| "sampling/sampling_logp_difference/mean": 0.027156472206115723, | |
| "step": 600, | |
| "step_time": 31.06699350780109 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0057291668374091385, | |
| "completions/max_length": 1017.1, | |
| "completions/max_terminated_length": 910.3, | |
| "completions/mean_length": 747.0932495117188, | |
| "completions/mean_terminated_length": 745.496240234375, | |
| "completions/min_length": 485.1, | |
| "completions/min_terminated_length": 485.1, | |
| "entropy": 0.7526243805885315, | |
| "epoch": 0.1045774044231099, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0151, | |
| "num_tokens": 273256972.0, | |
| "reward": -8.731144094467163, | |
| "reward_std": 5.71333794593811, | |
| "rewards/ADERawReward/mean": -8.782446098327636, | |
| "rewards/ADERawReward/std": 5.699142122268677, | |
| "rewards/StrictFormatReward/mean": 0.5130208373069763, | |
| "rewards/StrictFormatReward/std": 2.253889012336731, | |
| "sampling/importance_sampling_ratio/max": 2.8028954029083253, | |
| "sampling/importance_sampling_ratio/mean": 0.3589755445718765, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.905529975891113, | |
| "sampling/sampling_logp_difference/mean": 0.02720630671828985, | |
| "step": 610, | |
| "step_time": 31.30127082870167 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007291666837409139, | |
| "completions/max_length": 986.2, | |
| "completions/max_terminated_length": 908.3, | |
| "completions/mean_length": 746.2526245117188, | |
| "completions/mean_terminated_length": 744.2300537109375, | |
| "completions/min_length": 523.4, | |
| "completions/min_terminated_length": 523.4, | |
| "entropy": 0.7530896683533986, | |
| "epoch": 0.10629178810217726, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0115, | |
| "num_tokens": 277737025.0, | |
| "reward": -8.564061069488526, | |
| "reward_std": 4.777447652816773, | |
| "rewards/ADERawReward/mean": -8.621665000915527, | |
| "rewards/ADERawReward/std": 4.770789432525635, | |
| "rewards/StrictFormatReward/mean": 0.5760416656732559, | |
| "rewards/StrictFormatReward/std": 2.0861072897911073, | |
| "sampling/importance_sampling_ratio/max": 2.810882544517517, | |
| "sampling/importance_sampling_ratio/mean": 0.39588436484336853, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.1026554822921755, | |
| "sampling/sampling_logp_difference/mean": 0.02695068046450615, | |
| "step": 620, | |
| "step_time": 30.90586839320604 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007291666837409139, | |
| "completions/max_length": 988.7, | |
| "completions/max_terminated_length": 898.0, | |
| "completions/mean_length": 748.3562683105469, | |
| "completions/mean_terminated_length": 746.3339233398438, | |
| "completions/min_length": 629.4, | |
| "completions/min_terminated_length": 629.4, | |
| "entropy": 0.7435857594013214, | |
| "epoch": 0.10800617178124464, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0193, | |
| "num_tokens": 282222253.0, | |
| "reward": -9.023520565032959, | |
| "reward_std": 5.119320058822632, | |
| "rewards/ADERawReward/mean": -9.07825984954834, | |
| "rewards/ADERawReward/std": 5.104773378372192, | |
| "rewards/StrictFormatReward/mean": 0.5473958343267441, | |
| "rewards/StrictFormatReward/std": 2.160345029830933, | |
| "sampling/importance_sampling_ratio/max": 2.8361830711364746, | |
| "sampling/importance_sampling_ratio/mean": 0.37177495658397675, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.693781447410584, | |
| "sampling/sampling_logp_difference/mean": 0.027231728471815587, | |
| "step": 630, | |
| "step_time": 30.870413991407258 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.004687500139698386, | |
| "completions/max_length": 971.6, | |
| "completions/max_terminated_length": 889.4, | |
| "completions/mean_length": 747.0187744140625, | |
| "completions/mean_terminated_length": 745.7044677734375, | |
| "completions/min_length": 558.4, | |
| "completions/min_terminated_length": 558.4, | |
| "entropy": 0.7533182700475057, | |
| "epoch": 0.10972055546031202, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 286704369.0, | |
| "reward": -9.217678833007813, | |
| "reward_std": 5.054318857192993, | |
| "rewards/ADERawReward/mean": -9.277001667022706, | |
| "rewards/ADERawReward/std": 5.038924503326416, | |
| "rewards/StrictFormatReward/mean": 0.5932291626930237, | |
| "rewards/StrictFormatReward/std": 2.0508111119270325, | |
| "sampling/importance_sampling_ratio/max": 2.866812252998352, | |
| "sampling/importance_sampling_ratio/mean": 0.3646788477897644, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.960925984382629, | |
| "sampling/sampling_logp_difference/mean": 0.027474281564354896, | |
| "step": 640, | |
| "step_time": 30.600575475511143 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.003645833395421505, | |
| "completions/max_length": 960.3, | |
| "completions/max_terminated_length": 891.2, | |
| "completions/mean_length": 742.1609497070312, | |
| "completions/mean_terminated_length": 741.1292663574219, | |
| "completions/min_length": 366.8, | |
| "completions/min_terminated_length": 366.8, | |
| "entropy": 0.7416558722654979, | |
| "epoch": 0.1114349391393794, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0118, | |
| "num_tokens": 291177734.0, | |
| "reward": -8.407093048095703, | |
| "reward_std": 5.326395010948181, | |
| "rewards/ADERawReward/mean": -8.467561674118041, | |
| "rewards/ADERawReward/std": 5.315518450737, | |
| "rewards/StrictFormatReward/mean": 0.6046874970197678, | |
| "rewards/StrictFormatReward/std": 2.026141095161438, | |
| "sampling/importance_sampling_ratio/max": 2.862113666534424, | |
| "sampling/importance_sampling_ratio/mean": 0.3777022361755371, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.170439219474792, | |
| "sampling/sampling_logp_difference/mean": 0.027010084502398966, | |
| "step": 650, | |
| "step_time": 30.520393895095914 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.010937500279396772, | |
| "completions/max_length": 1009.0, | |
| "completions/max_terminated_length": 908.3, | |
| "completions/mean_length": 746.7343872070312, | |
| "completions/mean_terminated_length": 743.6657897949219, | |
| "completions/min_length": 455.1, | |
| "completions/min_terminated_length": 455.1, | |
| "entropy": 0.748453684647878, | |
| "epoch": 0.11314932281844677, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0029, | |
| "num_tokens": 295659304.0, | |
| "reward": -8.47295446395874, | |
| "reward_std": 5.315748786926269, | |
| "rewards/ADERawReward/mean": -8.525974988937378, | |
| "rewards/ADERawReward/std": 5.30073721408844, | |
| "rewards/StrictFormatReward/mean": 0.5302083283662796, | |
| "rewards/StrictFormatReward/std": 2.2150806427001952, | |
| "sampling/importance_sampling_ratio/max": 2.8308970451354982, | |
| "sampling/importance_sampling_ratio/mean": 0.36592633128166197, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.81339545249939, | |
| "sampling/sampling_logp_difference/mean": 0.027280481532216072, | |
| "step": 660, | |
| "step_time": 31.34898143170285 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0031250000931322573, | |
| "completions/max_length": 961.7, | |
| "completions/max_terminated_length": 898.4, | |
| "completions/mean_length": 745.1661743164062, | |
| "completions/mean_terminated_length": 744.2929931640625, | |
| "completions/min_length": 464.4, | |
| "completions/min_terminated_length": 464.4, | |
| "entropy": 0.7453262567520141, | |
| "epoch": 0.11486370649751414, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 300137783.0, | |
| "reward": -8.66427845954895, | |
| "reward_std": 4.92835762500763, | |
| "rewards/ADERawReward/mean": -8.727611589431763, | |
| "rewards/ADERawReward/std": 4.9187664270401, | |
| "rewards/StrictFormatReward/mean": 0.6333333373069763, | |
| "rewards/StrictFormatReward/std": 1.9716094613075257, | |
| "sampling/importance_sampling_ratio/max": 2.7208317041397097, | |
| "sampling/importance_sampling_ratio/mean": 0.3644146382808685, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.467776584625244, | |
| "sampling/sampling_logp_difference/mean": 0.027055931463837623, | |
| "step": 670, | |
| "step_time": 30.402021049396716 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007812500139698387, | |
| "completions/max_length": 1009.0, | |
| "completions/max_terminated_length": 900.7, | |
| "completions/mean_length": 744.5047119140625, | |
| "completions/mean_terminated_length": 742.3053039550781, | |
| "completions/min_length": 501.0, | |
| "completions/min_terminated_length": 501.0, | |
| "entropy": 0.7389722168445587, | |
| "epoch": 0.11657809017658152, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0105, | |
| "num_tokens": 304615648.0, | |
| "reward": -8.14780044555664, | |
| "reward_std": 5.765257406234741, | |
| "rewards/ADERawReward/mean": -8.198529577255249, | |
| "rewards/ADERawReward/std": 5.752817440032959, | |
| "rewards/StrictFormatReward/mean": 0.507291667163372, | |
| "rewards/StrictFormatReward/std": 2.2412220239639282, | |
| "sampling/importance_sampling_ratio/max": 2.838437080383301, | |
| "sampling/importance_sampling_ratio/mean": 0.3927165180444717, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.8545892953872682, | |
| "sampling/sampling_logp_difference/mean": 0.026737906597554685, | |
| "step": 680, | |
| "step_time": 31.265542278197245 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007291666837409139, | |
| "completions/max_length": 1011.0, | |
| "completions/max_terminated_length": 918.1, | |
| "completions/mean_length": 745.039599609375, | |
| "completions/mean_terminated_length": 743.0080322265625, | |
| "completions/min_length": 483.0, | |
| "completions/min_terminated_length": 483.0, | |
| "entropy": 0.7438246866067251, | |
| "epoch": 0.1182924738556489, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0139, | |
| "num_tokens": 309093692.0, | |
| "reward": -8.50984206199646, | |
| "reward_std": 4.543536067008972, | |
| "rewards/ADERawReward/mean": -8.562862634658813, | |
| "rewards/ADERawReward/std": 4.5306751251220705, | |
| "rewards/StrictFormatReward/mean": 0.5302083313465118, | |
| "rewards/StrictFormatReward/std": 2.1870830774307253, | |
| "sampling/importance_sampling_ratio/max": 2.8570920705795286, | |
| "sampling/importance_sampling_ratio/mean": 0.3624374896287918, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.160294127464295, | |
| "sampling/sampling_logp_difference/mean": 0.02724489979445934, | |
| "step": 690, | |
| "step_time": 31.26219962689502 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.005208333441987634, | |
| "completions/max_length": 996.0, | |
| "completions/max_terminated_length": 924.7, | |
| "completions/mean_length": 745.0484558105469, | |
| "completions/mean_terminated_length": 743.5991821289062, | |
| "completions/min_length": 539.6, | |
| "completions/min_terminated_length": 539.6, | |
| "entropy": 0.7445198019345601, | |
| "epoch": 0.12000685753471627, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0064, | |
| "num_tokens": 313571945.0, | |
| "reward": -8.856493520736695, | |
| "reward_std": 4.863119053840637, | |
| "rewards/ADERawReward/mean": -8.914670515060426, | |
| "rewards/ADERawReward/std": 4.857007074356079, | |
| "rewards/StrictFormatReward/mean": 0.5817708313465119, | |
| "rewards/StrictFormatReward/std": 2.0837946057319643, | |
| "sampling/importance_sampling_ratio/max": 2.8482714176177977, | |
| "sampling/importance_sampling_ratio/mean": 0.3727655470371246, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.315770363807678, | |
| "sampling/sampling_logp_difference/mean": 0.02694130353629589, | |
| "step": 700, | |
| "step_time": 31.00740701830364 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00885416679084301, | |
| "completions/max_length": 980.0, | |
| "completions/max_terminated_length": 904.4, | |
| "completions/mean_length": 745.5870056152344, | |
| "completions/mean_terminated_length": 743.1242919921875, | |
| "completions/min_length": 423.9, | |
| "completions/min_terminated_length": 423.9, | |
| "entropy": 0.7471017857392629, | |
| "epoch": 0.12172124121378365, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0164, | |
| "num_tokens": 318051584.0, | |
| "reward": -8.626063299179076, | |
| "reward_std": 5.039473915100098, | |
| "rewards/ADERawReward/mean": -8.679083728790284, | |
| "rewards/ADERawReward/std": 5.022432565689087, | |
| "rewards/StrictFormatReward/mean": 0.5302083313465118, | |
| "rewards/StrictFormatReward/std": 2.2059712767601014, | |
| "sampling/importance_sampling_ratio/max": 2.6881606578826904, | |
| "sampling/importance_sampling_ratio/mean": 0.35196583569049833, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.9412872314453127, | |
| "sampling/sampling_logp_difference/mean": 0.02722361944615841, | |
| "step": 710, | |
| "step_time": 30.79243855099194 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.007812500093132257, | |
| "completions/max_length": 1011.8, | |
| "completions/max_terminated_length": 889.3, | |
| "completions/mean_length": 745.9838806152344, | |
| "completions/mean_terminated_length": 743.8016540527344, | |
| "completions/min_length": 430.3, | |
| "completions/min_terminated_length": 430.3, | |
| "entropy": 0.737281721830368, | |
| "epoch": 0.12343562489285102, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0019, | |
| "num_tokens": 322532273.0, | |
| "reward": -8.676169490814209, | |
| "reward_std": 6.3611784934997555, | |
| "rewards/ADERawReward/mean": -8.736637830734253, | |
| "rewards/ADERawReward/std": 6.346572685241699, | |
| "rewards/StrictFormatReward/mean": 0.6046874940395355, | |
| "rewards/StrictFormatReward/std": 2.027357840538025, | |
| "sampling/importance_sampling_ratio/max": 2.7995285272598265, | |
| "sampling/importance_sampling_ratio/mean": 0.35478300750255587, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.93809871673584, | |
| "sampling/sampling_logp_difference/mean": 0.027037655375897883, | |
| "step": 720, | |
| "step_time": 31.32133076491009 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.006250000139698386, | |
| "completions/max_length": 1008.3, | |
| "completions/max_terminated_length": 902.0, | |
| "completions/mean_length": 744.9453369140625, | |
| "completions/mean_terminated_length": 743.1854309082031, | |
| "completions/min_length": 398.3, | |
| "completions/min_terminated_length": 398.3, | |
| "entropy": 0.7388640781243642, | |
| "epoch": 0.1251500085719184, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0171, | |
| "num_tokens": 327010056.0, | |
| "reward": -8.387274026870728, | |
| "reward_std": 4.668637943267822, | |
| "rewards/ADERawReward/mean": -8.443159294128417, | |
| "rewards/ADERawReward/std": 4.659743785858154, | |
| "rewards/StrictFormatReward/mean": 0.5588541686534881, | |
| "rewards/StrictFormatReward/std": 2.136683487892151, | |
| "sampling/importance_sampling_ratio/max": 2.7782902002334593, | |
| "sampling/importance_sampling_ratio/mean": 0.3791107714176178, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.789157485961914, | |
| "sampling/sampling_logp_difference/mean": 0.02694005910307169, | |
| "step": 730, | |
| "step_time": 31.208155655500015 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 730, | |
| "num_input_tokens_seen": 327010056, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |