Instructions to use Gege24/test_liars_dice with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Gege24/test_liars_dice with PEFT:
Base model is not found.
- Transformers
How to use Gege24/test_liars_dice with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Gege24/test_liars_dice")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Gege24/test_liars_dice", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Gege24/test_liars_dice with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Gege24/test_liars_dice"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/test_liars_dice",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/Gege24/test_liars_dice
- SGLang
How to use Gege24/test_liars_dice with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Gege24/test_liars_dice" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/test_liars_dice",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Gege24/test_liars_dice" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Gege24/test_liars_dice",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use Gege24/test_liars_dice with Docker Model Runner:
docker model run hf.co/Gege24/test_liars_dice
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.0006, | |
| "eval_steps": 500, | |
| "global_step": 60, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.54889988899231, | |
| "epoch": 1e-05, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.03602602332830429, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.0011, | |
| "num_tokens": 37068.0, | |
| "reward": -0.20241966843605042, | |
| "reward_std": 0.48188072443008423, | |
| "rewards/rollout_reward_func/mean": -0.20241966843605042, | |
| "rewards/rollout_reward_func/std": 0.5643806457519531, | |
| "sampling/importance_sampling_ratio/max": 0.07946816831827164, | |
| "sampling/importance_sampling_ratio/mean": 0.04483851045370102, | |
| "sampling/importance_sampling_ratio/min": 0.013508557341992855, | |
| "sampling/sampling_logp_difference/max": 2.2406487464904785, | |
| "sampling/sampling_logp_difference/mean": 1.6288080215454102, | |
| "step": 1, | |
| "step_time": 5.987766452998585 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.54889988899231, | |
| "epoch": 2e-05, | |
| "grad_norm": 0.03546445816755295, | |
| "kl": 0.0, | |
| "learning_rate": 2.8571428571428575e-07, | |
| "loss": -0.0011, | |
| "step": 2, | |
| "step_time": 2.7118166719992587 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.612898111343384, | |
| "epoch": 3e-05, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.028961816802620888, | |
| "kl": 0.0006349515169858932, | |
| "learning_rate": 5.714285714285715e-07, | |
| "loss": -0.0011, | |
| "num_tokens": 73706.0, | |
| "reward": -0.41709062457084656, | |
| "reward_std": 0.38032135367393494, | |
| "rewards/rollout_reward_func/mean": -0.41709062457084656, | |
| "rewards/rollout_reward_func/std": 0.5864750146865845, | |
| "sampling/importance_sampling_ratio/max": 0.07319076359272003, | |
| "sampling/importance_sampling_ratio/mean": 0.04485338553786278, | |
| "sampling/importance_sampling_ratio/min": 0.010412708856165409, | |
| "sampling/sampling_logp_difference/max": 2.62746524810791, | |
| "sampling/sampling_logp_difference/mean": 1.631974458694458, | |
| "step": 3, | |
| "step_time": 5.237267270999837 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.621538519859314, | |
| "epoch": 4e-05, | |
| "grad_norm": 0.027420829981565475, | |
| "kl": 0.0009405575692653656, | |
| "learning_rate": 8.571428571428572e-07, | |
| "loss": -0.0011, | |
| "step": 4, | |
| "step_time": 2.7475997250003275 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 5.0, | |
| "completions/mean_length": 2.53125, | |
| "completions/mean_terminated_length": 2.096774101257324, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.608618140220642, | |
| "epoch": 5e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04249607026576996, | |
| "kl": 0.0009066914208233356, | |
| "learning_rate": 1.142857142857143e-06, | |
| "loss": -0.0019, | |
| "num_tokens": 108621.0, | |
| "reward": -0.4377398192882538, | |
| "reward_std": 0.6512277126312256, | |
| "rewards/rollout_reward_func/mean": -0.4377398192882538, | |
| "rewards/rollout_reward_func/std": 0.8918678760528564, | |
| "sampling/importance_sampling_ratio/max": 0.0737227350473404, | |
| "sampling/importance_sampling_ratio/mean": 0.03858622536063194, | |
| "sampling/importance_sampling_ratio/min": 1.0664673354490728e-12, | |
| "sampling/sampling_logp_difference/max": 4.4544358253479, | |
| "sampling/sampling_logp_difference/mean": 1.7643263339996338, | |
| "step": 5, | |
| "step_time": 6.21450720699886 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.600322604179382, | |
| "epoch": 6e-05, | |
| "grad_norm": 0.04186399653553963, | |
| "kl": 0.0009581162594258785, | |
| "learning_rate": 1.4285714285714286e-06, | |
| "loss": -0.002, | |
| "step": 6, | |
| "step_time": 2.7577716289997625 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013888888992369175, | |
| "clip_ratio/high_mean": 0.0069444444961845875, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0069444444961845875, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7.0, | |
| "completions/max_terminated_length": 7.0, | |
| "completions/mean_length": 2.15625, | |
| "completions/mean_terminated_length": 2.15625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.887233972549438, | |
| "epoch": 7e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01845015399158001, | |
| "kl": 0.001277084978937637, | |
| "learning_rate": 1.7142857142857145e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 142642.0, | |
| "reward": -0.24416625499725342, | |
| "reward_std": 0.7532411813735962, | |
| "rewards/rollout_reward_func/mean": -0.24416625499725342, | |
| "rewards/rollout_reward_func/std": 0.8073007464408875, | |
| "sampling/importance_sampling_ratio/max": 0.05460228770971298, | |
| "sampling/importance_sampling_ratio/mean": 0.028035324066877365, | |
| "sampling/importance_sampling_ratio/min": 1.4402675105884555e-06, | |
| "sampling/sampling_logp_difference/max": 3.2391209602355957, | |
| "sampling/sampling_logp_difference/mean": 1.8511791229248047, | |
| "step": 7, | |
| "step_time": 5.192916050999884 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02777777798473835, | |
| "clip_ratio/high_mean": 0.013888888992369175, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.013888888992369175, | |
| "entropy": 8.885694026947021, | |
| "epoch": 8e-05, | |
| "grad_norm": 0.018649807199835777, | |
| "kl": 0.0008755421440582722, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": -0.0001, | |
| "step": 8, | |
| "step_time": 2.734651068999483 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.4375, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.617319464683533, | |
| "epoch": 9e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03367479518055916, | |
| "kl": 0.0011472837650217116, | |
| "learning_rate": 2.285714285714286e-06, | |
| "loss": -0.0017, | |
| "num_tokens": 180338.0, | |
| "reward": -0.2762250006198883, | |
| "reward_std": 0.6308639645576477, | |
| "rewards/rollout_reward_func/mean": -0.2762250006198883, | |
| "rewards/rollout_reward_func/std": 0.619946300983429, | |
| "sampling/importance_sampling_ratio/max": 0.08318282663822174, | |
| "sampling/importance_sampling_ratio/mean": 0.04022577404975891, | |
| "sampling/importance_sampling_ratio/min": 8.560036707239149e-11, | |
| "sampling/sampling_logp_difference/max": 2.8513259887695312, | |
| "sampling/sampling_logp_difference/mean": 1.6676621437072754, | |
| "step": 9, | |
| "step_time": 5.192849637998734 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.604341268539429, | |
| "epoch": 0.0001, | |
| "grad_norm": 0.03420183062553406, | |
| "kl": 0.001177078731416259, | |
| "learning_rate": 2.571428571428571e-06, | |
| "loss": -0.0017, | |
| "step": 10, | |
| "step_time": 2.779620520998833 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.559729814529419, | |
| "epoch": 0.00011, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.02679450251162052, | |
| "kl": 0.0014951191842556, | |
| "learning_rate": 2.8571428571428573e-06, | |
| "loss": -0.0013, | |
| "num_tokens": 215884.0, | |
| "reward": 0.05839650332927704, | |
| "reward_std": 0.5475454926490784, | |
| "rewards/rollout_reward_func/mean": 0.05839650332927704, | |
| "rewards/rollout_reward_func/std": 0.7081521153450012, | |
| "sampling/importance_sampling_ratio/max": 0.09729397296905518, | |
| "sampling/importance_sampling_ratio/mean": 0.04614107310771942, | |
| "sampling/importance_sampling_ratio/min": 0.009668753482401371, | |
| "sampling/sampling_logp_difference/max": 2.5881471633911133, | |
| "sampling/sampling_logp_difference/mean": 1.6770999431610107, | |
| "step": 11, | |
| "step_time": 6.151045407999845 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.533240914344788, | |
| "epoch": 0.00012, | |
| "grad_norm": 0.027015963569283485, | |
| "kl": 0.002467602491378784, | |
| "learning_rate": 3.142857142857143e-06, | |
| "loss": -0.0013, | |
| "step": 12, | |
| "step_time": 2.712993424999695 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5.0, | |
| "completions/max_terminated_length": 5.0, | |
| "completions/mean_length": 2.09375, | |
| "completions/mean_terminated_length": 2.09375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.65865409374237, | |
| "epoch": 0.00013, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03438998758792877, | |
| "kl": 0.0025057614548131824, | |
| "learning_rate": 3.428571428571429e-06, | |
| "loss": -0.0012, | |
| "num_tokens": 250983.0, | |
| "reward": -0.3477444052696228, | |
| "reward_std": 0.8011652231216431, | |
| "rewards/rollout_reward_func/mean": -0.3477444052696228, | |
| "rewards/rollout_reward_func/std": 0.8506107926368713, | |
| "sampling/importance_sampling_ratio/max": 0.08270177990198135, | |
| "sampling/importance_sampling_ratio/mean": 0.03867126256227493, | |
| "sampling/importance_sampling_ratio/min": 2.151069793399074e-06, | |
| "sampling/sampling_logp_difference/max": 4.863964557647705, | |
| "sampling/sampling_logp_difference/mean": 1.7552906274795532, | |
| "step": 13, | |
| "step_time": 5.348588517999815 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.637501120567322, | |
| "epoch": 0.00014, | |
| "grad_norm": 0.03334236517548561, | |
| "kl": 0.003458394785411656, | |
| "learning_rate": 3.7142857142857146e-06, | |
| "loss": -0.0013, | |
| "step": 14, | |
| "step_time": 2.796233564999966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8.0, | |
| "completions/max_terminated_length": 8.0, | |
| "completions/mean_length": 2.1875, | |
| "completions/mean_terminated_length": 2.1875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.605337858200073, | |
| "epoch": 0.00015, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0344076044857502, | |
| "kl": 0.007246186607517302, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": -0.0011, | |
| "num_tokens": 287257.0, | |
| "reward": -0.5175027251243591, | |
| "reward_std": 0.7213335633277893, | |
| "rewards/rollout_reward_func/mean": -0.5175027251243591, | |
| "rewards/rollout_reward_func/std": 0.7736045718193054, | |
| "sampling/importance_sampling_ratio/max": 0.07941123098134995, | |
| "sampling/importance_sampling_ratio/mean": 0.03353440761566162, | |
| "sampling/importance_sampling_ratio/min": 8.447619620710611e-05, | |
| "sampling/sampling_logp_difference/max": 2.514495372772217, | |
| "sampling/sampling_logp_difference/mean": 1.715531349182129, | |
| "step": 15, | |
| "step_time": 5.0371513690006395 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.572421252727509, | |
| "epoch": 0.00016, | |
| "grad_norm": 0.0361904576420784, | |
| "kl": 0.010158459888771176, | |
| "learning_rate": 4.2857142857142855e-06, | |
| "loss": -0.0012, | |
| "step": 16, | |
| "step_time": 3.6766240449996985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0052083334885537624, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 10.0, | |
| "completions/mean_length": 2.6875, | |
| "completions/mean_terminated_length": 2.2580645084381104, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.298084318637848, | |
| "epoch": 0.00017, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03764951229095459, | |
| "kl": 0.01280841525294818, | |
| "learning_rate": 4.571428571428572e-06, | |
| "loss": -0.0025, | |
| "num_tokens": 319698.0, | |
| "reward": -0.4354173243045807, | |
| "reward_std": 0.6479126214981079, | |
| "rewards/rollout_reward_func/mean": -0.4354173243045807, | |
| "rewards/rollout_reward_func/std": 0.8862241506576538, | |
| "sampling/importance_sampling_ratio/max": 0.11163407564163208, | |
| "sampling/importance_sampling_ratio/mean": 0.04720360040664673, | |
| "sampling/importance_sampling_ratio/min": 5.244973292489741e-12, | |
| "sampling/sampling_logp_difference/max": 4.713146686553955, | |
| "sampling/sampling_logp_difference/mean": 1.6690685749053955, | |
| "step": 17, | |
| "step_time": 5.1438336040009744 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0052083334885537624, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "entropy": 8.210463047027588, | |
| "epoch": 0.00018, | |
| "grad_norm": 0.03806741535663605, | |
| "kl": 0.01803363612270914, | |
| "learning_rate": 4.857142857142858e-06, | |
| "loss": -0.0026, | |
| "step": 18, | |
| "step_time": 2.7552450699986366 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.558752536773682, | |
| "epoch": 0.00019, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04838458448648453, | |
| "kl": 0.022023072466254234, | |
| "learning_rate": 5.142857142857142e-06, | |
| "loss": -0.0019, | |
| "num_tokens": 351318.0, | |
| "reward": -0.6331132054328918, | |
| "reward_std": 0.8535451292991638, | |
| "rewards/rollout_reward_func/mean": -0.6331132054328918, | |
| "rewards/rollout_reward_func/std": 0.9285341501235962, | |
| "sampling/importance_sampling_ratio/max": 0.11876487731933594, | |
| "sampling/importance_sampling_ratio/mean": 0.04305056482553482, | |
| "sampling/importance_sampling_ratio/min": 0.01192709431052208, | |
| "sampling/sampling_logp_difference/max": 2.4490432739257812, | |
| "sampling/sampling_logp_difference/mean": 1.6915147304534912, | |
| "step": 19, | |
| "step_time": 4.906003911999505 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "entropy": 8.480961680412292, | |
| "epoch": 0.0002, | |
| "grad_norm": 0.047831419855356216, | |
| "kl": 0.03148091956973076, | |
| "learning_rate": 5.428571428571429e-06, | |
| "loss": -0.0021, | |
| "step": 20, | |
| "step_time": 2.542041312000947 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.914846777915955, | |
| "epoch": 0.00021, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0461098738014698, | |
| "kl": 0.05386248789727688, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": -0.004, | |
| "num_tokens": 382194.0, | |
| "reward": 0.12508951127529144, | |
| "reward_std": 1.0255252122879028, | |
| "rewards/rollout_reward_func/mean": 0.12508951127529144, | |
| "rewards/rollout_reward_func/std": 1.0441889762878418, | |
| "sampling/importance_sampling_ratio/max": 0.15263937413692474, | |
| "sampling/importance_sampling_ratio/mean": 0.07228268682956696, | |
| "sampling/importance_sampling_ratio/min": 0.011906024999916553, | |
| "sampling/sampling_logp_difference/max": 2.5268516540527344, | |
| "sampling/sampling_logp_difference/mean": 1.4949290752410889, | |
| "step": 21, | |
| "step_time": 5.119830969998475 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.746252477169037, | |
| "epoch": 0.00022, | |
| "grad_norm": 0.04356187954545021, | |
| "kl": 0.06908445432782173, | |
| "learning_rate": 6e-06, | |
| "loss": -0.0043, | |
| "step": 22, | |
| "step_time": 3.6642874440012747 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.505157113075256, | |
| "epoch": 0.00023, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027521340176463127, | |
| "kl": 0.11650293320417404, | |
| "learning_rate": 6.285714285714286e-06, | |
| "loss": -0.0037, | |
| "num_tokens": 418193.0, | |
| "reward": 0.050230324268341064, | |
| "reward_std": 0.5550512671470642, | |
| "rewards/rollout_reward_func/mean": 0.050230324268341064, | |
| "rewards/rollout_reward_func/std": 0.7158081531524658, | |
| "sampling/importance_sampling_ratio/max": 0.17737992107868195, | |
| "sampling/importance_sampling_ratio/mean": 0.08728897571563721, | |
| "sampling/importance_sampling_ratio/min": 0.01220005378127098, | |
| "sampling/sampling_logp_difference/max": 2.312082290649414, | |
| "sampling/sampling_logp_difference/mean": 1.4727132320404053, | |
| "step": 23, | |
| "step_time": 5.118901103000098 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.03125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.046875, | |
| "entropy": 7.405115187168121, | |
| "epoch": 0.00024, | |
| "grad_norm": 0.027487488463521004, | |
| "kl": 0.18349953554570675, | |
| "learning_rate": 6.571428571428572e-06, | |
| "loss": -0.0038, | |
| "step": 24, | |
| "step_time": 2.720049919002122 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.015625, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 14.0, | |
| "completions/mean_length": 2.8125, | |
| "completions/mean_terminated_length": 2.387096643447876, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.812176287174225, | |
| "epoch": 0.00025, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04480979964137077, | |
| "kl": 0.12202088022604585, | |
| "learning_rate": 6.857142857142858e-06, | |
| "loss": -0.003, | |
| "num_tokens": 453429.0, | |
| "reward": -0.5567976832389832, | |
| "reward_std": 0.5721786022186279, | |
| "rewards/rollout_reward_func/mean": -0.5567976832389832, | |
| "rewards/rollout_reward_func/std": 0.6177526116371155, | |
| "sampling/importance_sampling_ratio/max": 0.19887018203735352, | |
| "sampling/importance_sampling_ratio/mean": 0.06785006076097488, | |
| "sampling/importance_sampling_ratio/min": 6.38808743654907e-15, | |
| "sampling/sampling_logp_difference/max": 4.993244647979736, | |
| "sampling/sampling_logp_difference/mean": 1.6287214756011963, | |
| "step": 25, | |
| "step_time": 5.136989613000878 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.775655627250671, | |
| "epoch": 0.00026, | |
| "grad_norm": 0.051532234996557236, | |
| "kl": 0.13951302412897348, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": -0.0031, | |
| "step": 26, | |
| "step_time": 2.7561433119990397 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 15.0, | |
| "completions/max_terminated_length": 15.0, | |
| "completions/mean_length": 2.40625, | |
| "completions/mean_terminated_length": 2.40625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.139402210712433, | |
| "epoch": 0.00027, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1159820482134819, | |
| "kl": 0.21849708445370197, | |
| "learning_rate": 7.428571428571429e-06, | |
| "loss": -0.0033, | |
| "num_tokens": 488130.0, | |
| "reward": 0.0451994352042675, | |
| "reward_std": 0.5705078840255737, | |
| "rewards/rollout_reward_func/mean": 0.0451994352042675, | |
| "rewards/rollout_reward_func/std": 0.7002898454666138, | |
| "sampling/importance_sampling_ratio/max": 0.21661195158958435, | |
| "sampling/importance_sampling_ratio/mean": 0.11537822335958481, | |
| "sampling/importance_sampling_ratio/min": 1.9111427718154772e-11, | |
| "sampling/sampling_logp_difference/max": 5.494892120361328, | |
| "sampling/sampling_logp_difference/mean": 1.4119889736175537, | |
| "step": 27, | |
| "step_time": 5.695921420998275 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.015625, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03125, | |
| "entropy": 7.091515064239502, | |
| "epoch": 0.00028, | |
| "grad_norm": 0.05888032913208008, | |
| "kl": 0.19798127934336662, | |
| "learning_rate": 7.714285714285716e-06, | |
| "loss": -0.0036, | |
| "step": 28, | |
| "step_time": 3.262695406999228 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 12.0, | |
| "completions/mean_length": 2.84375, | |
| "completions/mean_terminated_length": 2.4193546772003174, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.829086482524872, | |
| "epoch": 0.00029, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.07569188624620438, | |
| "kl": 0.3484500087797642, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": -0.0047, | |
| "num_tokens": 523810.0, | |
| "reward": 0.25646376609802246, | |
| "reward_std": 0.19004222750663757, | |
| "rewards/rollout_reward_func/mean": 0.25646376609802246, | |
| "rewards/rollout_reward_func/std": 0.5962904691696167, | |
| "sampling/importance_sampling_ratio/max": 0.2295530140399933, | |
| "sampling/importance_sampling_ratio/mean": 0.12002766132354736, | |
| "sampling/importance_sampling_ratio/min": 3.580996610352827e-10, | |
| "sampling/sampling_logp_difference/max": 4.408937454223633, | |
| "sampling/sampling_logp_difference/mean": 1.4061723947525024, | |
| "step": 29, | |
| "step_time": 5.195359340000323 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.82170957326889, | |
| "epoch": 0.0003, | |
| "grad_norm": 0.072876937687397, | |
| "kl": 0.3563471883535385, | |
| "learning_rate": 8.285714285714287e-06, | |
| "loss": -0.0048, | |
| "step": 30, | |
| "step_time": 2.7664619609995498 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.4375, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.176153600215912, | |
| "epoch": 0.00031, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.012421791441738605, | |
| "kl": 0.19252930209040642, | |
| "learning_rate": 8.571428571428571e-06, | |
| "loss": -0.0031, | |
| "num_tokens": 562344.0, | |
| "reward": 0.007906697690486908, | |
| "reward_std": 0.25298231840133667, | |
| "rewards/rollout_reward_func/mean": 0.007906697690486908, | |
| "rewards/rollout_reward_func/std": 0.34386366605758667, | |
| "sampling/importance_sampling_ratio/max": 0.2527497410774231, | |
| "sampling/importance_sampling_ratio/mean": 0.12306913733482361, | |
| "sampling/importance_sampling_ratio/min": 1.3953619618263524e-13, | |
| "sampling/sampling_logp_difference/max": 4.638323783874512, | |
| "sampling/sampling_logp_difference/mean": 1.436547040939331, | |
| "step": 31, | |
| "step_time": 5.2235907870026494 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.177456617355347, | |
| "epoch": 0.00032, | |
| "grad_norm": 0.011528298258781433, | |
| "kl": 0.19802813418209553, | |
| "learning_rate": 8.857142857142858e-06, | |
| "loss": -0.0031, | |
| "step": 32, | |
| "step_time": 2.7817133249991457 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013888888992369175, | |
| "clip_ratio/high_mean": 0.0069444444961845875, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0069444444961845875, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.875, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.5588818192481995, | |
| "epoch": 0.00033, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.019391389563679695, | |
| "kl": 0.19253090023994446, | |
| "learning_rate": 9.142857142857144e-06, | |
| "loss": -0.0068, | |
| "num_tokens": 596903.0, | |
| "reward": -0.23663240671157837, | |
| "reward_std": 0.7337036728858948, | |
| "rewards/rollout_reward_func/mean": -0.23663240671157837, | |
| "rewards/rollout_reward_func/std": 0.7588505148887634, | |
| "sampling/importance_sampling_ratio/max": 0.24936437606811523, | |
| "sampling/importance_sampling_ratio/mean": 0.09548068046569824, | |
| "sampling/importance_sampling_ratio/min": 2.8611614813489616e-11, | |
| "sampling/sampling_logp_difference/max": 4.682908058166504, | |
| "sampling/sampling_logp_difference/mean": 1.4331482648849487, | |
| "step": 33, | |
| "step_time": 5.650446067998928 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.045138888992369175, | |
| "clip_ratio/high_mean": 0.022569444496184587, | |
| "clip_ratio/low_mean": 0.015625, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03819444449618459, | |
| "entropy": 7.561095893383026, | |
| "epoch": 0.00034, | |
| "grad_norm": 0.0243048295378685, | |
| "kl": 0.2237775418907404, | |
| "learning_rate": 9.42857142857143e-06, | |
| "loss": -0.0068, | |
| "step": 34, | |
| "step_time": 3.267381875997671 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.21353954076767, | |
| "epoch": 0.00035, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.03242993354797363, | |
| "kl": 0.23187845014035702, | |
| "learning_rate": 9.714285714285715e-06, | |
| "loss": -0.0051, | |
| "num_tokens": 630878.0, | |
| "reward": 0.07425153255462646, | |
| "reward_std": 0.39872053265571594, | |
| "rewards/rollout_reward_func/mean": 0.07425153255462646, | |
| "rewards/rollout_reward_func/std": 0.5568375587463379, | |
| "sampling/importance_sampling_ratio/max": 0.26548439264297485, | |
| "sampling/importance_sampling_ratio/mean": 0.12363378703594208, | |
| "sampling/importance_sampling_ratio/min": 0.00769618758931756, | |
| "sampling/sampling_logp_difference/max": 3.031831979751587, | |
| "sampling/sampling_logp_difference/mean": 1.3642754554748535, | |
| "step": 35, | |
| "step_time": 5.206078858000183 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.159900069236755, | |
| "epoch": 0.00036, | |
| "grad_norm": 0.0284445621073246, | |
| "kl": 0.249108312651515, | |
| "learning_rate": 1e-05, | |
| "loss": -0.0052, | |
| "step": 36, | |
| "step_time": 2.761536221001734 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 5.0, | |
| "completions/mean_length": 2.53125, | |
| "completions/mean_terminated_length": 2.096774101257324, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.710843145847321, | |
| "epoch": 0.00037, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.012415582314133644, | |
| "kl": 0.21114499121904373, | |
| "learning_rate": 9.9999999995372e-06, | |
| "loss": -0.0052, | |
| "num_tokens": 668297.0, | |
| "reward": -0.05626409500837326, | |
| "reward_std": 0.34207823872566223, | |
| "rewards/rollout_reward_func/mean": -0.05626409500837326, | |
| "rewards/rollout_reward_func/std": 0.40576615929603577, | |
| "sampling/importance_sampling_ratio/max": 0.28296810388565063, | |
| "sampling/importance_sampling_ratio/mean": 0.16521048545837402, | |
| "sampling/importance_sampling_ratio/min": 7.181252052973312e-15, | |
| "sampling/sampling_logp_difference/max": 3.9721055030822754, | |
| "sampling/sampling_logp_difference/mean": 1.2990117073059082, | |
| "step": 37, | |
| "step_time": 5.382229439999719 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "entropy": 6.636479735374451, | |
| "epoch": 0.00038, | |
| "grad_norm": 0.011466137133538723, | |
| "kl": 0.22695374488830566, | |
| "learning_rate": 9.999999998148802e-06, | |
| "loss": -0.0053, | |
| "step": 38, | |
| "step_time": 2.8060927069982426 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.019097222248092294, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.019097222248092294, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 5.0, | |
| "completions/mean_length": 3.40625, | |
| "completions/mean_terminated_length": 2.1034481525421143, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.852519392967224, | |
| "epoch": 0.00039, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024172235280275345, | |
| "kl": 0.25472032092511654, | |
| "learning_rate": 9.999999995834804e-06, | |
| "loss": -0.009, | |
| "num_tokens": 704978.0, | |
| "reward": -0.23096521198749542, | |
| "reward_std": 0.4871903955936432, | |
| "rewards/rollout_reward_func/mean": -0.23096521198749542, | |
| "rewards/rollout_reward_func/std": 0.5004134774208069, | |
| "sampling/importance_sampling_ratio/max": 0.2927253544330597, | |
| "sampling/importance_sampling_ratio/mean": 0.1400977373123169, | |
| "sampling/importance_sampling_ratio/min": 3.20386295618591e-13, | |
| "sampling/sampling_logp_difference/max": 4.607293128967285, | |
| "sampling/sampling_logp_difference/mean": 1.322880744934082, | |
| "step": 39, | |
| "step_time": 5.8398218920001455 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0034722222480922937, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034722222480922937, | |
| "entropy": 6.7819119691848755, | |
| "epoch": 0.0004, | |
| "grad_norm": 0.023572752252221107, | |
| "kl": 0.2616057936102152, | |
| "learning_rate": 9.999999992595207e-06, | |
| "loss": -0.0092, | |
| "step": 40, | |
| "step_time": 2.772617318999437 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.03125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03125, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.4375, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.629358530044556, | |
| "epoch": 0.00041, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.021721255034208298, | |
| "kl": 0.3800278306007385, | |
| "learning_rate": 9.999999988430008e-06, | |
| "loss": -0.0068, | |
| "num_tokens": 739943.0, | |
| "reward": -0.0039390698075294495, | |
| "reward_std": 0.5532426238059998, | |
| "rewards/rollout_reward_func/mean": -0.0039390698075294495, | |
| "rewards/rollout_reward_func/std": 0.6965489983558655, | |
| "sampling/importance_sampling_ratio/max": 0.31811973452568054, | |
| "sampling/importance_sampling_ratio/mean": 0.16932719945907593, | |
| "sampling/importance_sampling_ratio/min": 2.988759240096783e-11, | |
| "sampling/sampling_logp_difference/max": 3.9750123023986816, | |
| "sampling/sampling_logp_difference/mean": 1.231735110282898, | |
| "step": 41, | |
| "step_time": 5.137522204998277 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.514784038066864, | |
| "epoch": 0.00042, | |
| "grad_norm": 0.016385503113269806, | |
| "kl": 0.3676511310040951, | |
| "learning_rate": 9.999999983339212e-06, | |
| "loss": -0.0069, | |
| "step": 42, | |
| "step_time": 2.7499034580005173 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.131894528865814, | |
| "epoch": 0.00043, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.021295806393027306, | |
| "kl": 0.3743965122848749, | |
| "learning_rate": 9.999999977322818e-06, | |
| "loss": -0.0082, | |
| "num_tokens": 775585.0, | |
| "reward": -0.1512046605348587, | |
| "reward_std": 0.5463583469390869, | |
| "rewards/rollout_reward_func/mean": -0.1512046605348587, | |
| "rewards/rollout_reward_func/std": 0.6555477976799011, | |
| "sampling/importance_sampling_ratio/max": 0.33739665150642395, | |
| "sampling/importance_sampling_ratio/mean": 0.2010621726512909, | |
| "sampling/importance_sampling_ratio/min": 0.013526393100619316, | |
| "sampling/sampling_logp_difference/max": 2.7704052925109863, | |
| "sampling/sampling_logp_difference/mean": 1.0442215204238892, | |
| "step": 43, | |
| "step_time": 4.971326774000772 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "entropy": 6.021390020847321, | |
| "epoch": 0.00044, | |
| "grad_norm": 0.024328157305717468, | |
| "kl": 0.3970394376665354, | |
| "learning_rate": 9.999999970380822e-06, | |
| "loss": -0.0084, | |
| "step": 44, | |
| "step_time": 2.748106813000959 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.4375, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.561417460441589, | |
| "epoch": 0.00045, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0709402859210968, | |
| "kl": 0.5104918628931046, | |
| "learning_rate": 9.999999962513228e-06, | |
| "loss": -0.0063, | |
| "num_tokens": 813291.0, | |
| "reward": 0.015873141586780548, | |
| "reward_std": 0.19914725422859192, | |
| "rewards/rollout_reward_func/mean": 0.015873141586780548, | |
| "rewards/rollout_reward_func/std": 0.27820026874542236, | |
| "sampling/importance_sampling_ratio/max": 0.35048311948776245, | |
| "sampling/importance_sampling_ratio/mean": 0.24818521738052368, | |
| "sampling/importance_sampling_ratio/min": 1.603953254936119e-11, | |
| "sampling/sampling_logp_difference/max": 2.631711959838867, | |
| "sampling/sampling_logp_difference/mean": 0.938794732093811, | |
| "step": 45, | |
| "step_time": 5.729797389999476 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.534043729305267, | |
| "epoch": 0.00046, | |
| "grad_norm": 0.07074378430843353, | |
| "kl": 0.5150652844458818, | |
| "learning_rate": 9.999999953720035e-06, | |
| "loss": -0.0064, | |
| "step": 46, | |
| "step_time": 2.7925665359998675 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.875, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.189878046512604, | |
| "epoch": 0.00047, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.018252786248922348, | |
| "kl": 0.3964756764471531, | |
| "learning_rate": 9.99999994400124e-06, | |
| "loss": -0.0065, | |
| "num_tokens": 850551.0, | |
| "reward": -0.0004236232489347458, | |
| "reward_std": 0.22690175473690033, | |
| "rewards/rollout_reward_func/mean": -0.0004236232489347458, | |
| "rewards/rollout_reward_func/std": 0.31507667899131775, | |
| "sampling/importance_sampling_ratio/max": 0.36628416180610657, | |
| "sampling/importance_sampling_ratio/mean": 0.19846494495868683, | |
| "sampling/importance_sampling_ratio/min": 1.2659386039448606e-10, | |
| "sampling/sampling_logp_difference/max": 4.142038822174072, | |
| "sampling/sampling_logp_difference/mean": 1.0495840311050415, | |
| "step": 47, | |
| "step_time": 5.31281194299936 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.272528171539307, | |
| "epoch": 0.00048, | |
| "grad_norm": 0.01557189505547285, | |
| "kl": 0.37549079209566116, | |
| "learning_rate": 9.999999933356848e-06, | |
| "loss": -0.0065, | |
| "step": 48, | |
| "step_time": 2.7983943749995888 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.4375, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.748881280422211, | |
| "epoch": 0.00049, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02466733008623123, | |
| "kl": 0.4844143260270357, | |
| "learning_rate": 9.999999921786855e-06, | |
| "loss": -0.0128, | |
| "num_tokens": 888151.0, | |
| "reward": -0.03596695885062218, | |
| "reward_std": 0.2966480255126953, | |
| "rewards/rollout_reward_func/mean": -0.03596695885062218, | |
| "rewards/rollout_reward_func/std": 0.41368940472602844, | |
| "sampling/importance_sampling_ratio/max": 0.3809582591056824, | |
| "sampling/importance_sampling_ratio/mean": 0.24179524183273315, | |
| "sampling/importance_sampling_ratio/min": 1.7608540181512922e-09, | |
| "sampling/sampling_logp_difference/max": 3.226844310760498, | |
| "sampling/sampling_logp_difference/mean": 1.0408389568328857, | |
| "step": 49, | |
| "step_time": 5.312362946001485 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.03125, | |
| "clip_ratio/low_min": 0.03125, | |
| "clip_ratio/region_mean": 0.03125, | |
| "entropy": 5.748369365930557, | |
| "epoch": 0.0005, | |
| "grad_norm": 0.022601323202252388, | |
| "kl": 0.5012617446482182, | |
| "learning_rate": 9.999999909291265e-06, | |
| "loss": -0.013, | |
| "step": 50, | |
| "step_time": 3.2023114370003896 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.002339065074921, | |
| "epoch": 0.00051, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.017942914739251137, | |
| "kl": 0.3239247240126133, | |
| "learning_rate": 9.999999895870075e-06, | |
| "loss": -0.0091, | |
| "num_tokens": 926183.0, | |
| "reward": 0.09451328217983246, | |
| "reward_std": 0.026711096987128258, | |
| "rewards/rollout_reward_func/mean": 0.09451328217983246, | |
| "rewards/rollout_reward_func/std": 0.038156598806381226, | |
| "sampling/importance_sampling_ratio/max": 0.39242714643478394, | |
| "sampling/importance_sampling_ratio/mean": 0.2970275282859802, | |
| "sampling/importance_sampling_ratio/min": 0.010224375873804092, | |
| "sampling/sampling_logp_difference/max": 3.037482500076294, | |
| "sampling/sampling_logp_difference/mean": 0.8098665475845337, | |
| "step": 51, | |
| "step_time": 5.696433805002016 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.001863986253738, | |
| "epoch": 0.00052, | |
| "grad_norm": 0.016663307324051857, | |
| "kl": 0.3272556010633707, | |
| "learning_rate": 9.999999881523285e-06, | |
| "loss": -0.0093, | |
| "step": 52, | |
| "step_time": 2.7697423050003636 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2.0, | |
| "completions/max_terminated_length": 2.0, | |
| "completions/mean_length": 2.0, | |
| "completions/mean_terminated_length": 2.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.3582839369773865, | |
| "epoch": 0.00053, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04750742390751839, | |
| "kl": 0.5478413961827755, | |
| "learning_rate": 9.999999866250896e-06, | |
| "loss": -0.0016, | |
| "num_tokens": 960803.0, | |
| "reward": -0.5398619174957275, | |
| "reward_std": 0.47637394070625305, | |
| "rewards/rollout_reward_func/mean": -0.5398619174957275, | |
| "rewards/rollout_reward_func/std": 0.5747888684272766, | |
| "sampling/importance_sampling_ratio/max": 0.4040625989437103, | |
| "sampling/importance_sampling_ratio/mean": 0.21569415926933289, | |
| "sampling/importance_sampling_ratio/min": 0.005049743689596653, | |
| "sampling/sampling_logp_difference/max": 3.7417404651641846, | |
| "sampling/sampling_logp_difference/mean": 1.302799940109253, | |
| "step": 53, | |
| "step_time": 5.149267007999697 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.336145222187042, | |
| "epoch": 0.00054, | |
| "grad_norm": 0.038806021213531494, | |
| "kl": 0.5272092521190643, | |
| "learning_rate": 9.999999850052909e-06, | |
| "loss": -0.0016, | |
| "step": 54, | |
| "step_time": 2.742318255999635 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 4.0, | |
| "completions/mean_length": 2.5, | |
| "completions/mean_terminated_length": 2.064516067504883, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.6335746347904205, | |
| "epoch": 0.00055, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029065297916531563, | |
| "kl": 0.5286761000752449, | |
| "learning_rate": 9.99999983292932e-06, | |
| "loss": -0.0135, | |
| "num_tokens": 998217.0, | |
| "reward": -0.1721905618906021, | |
| "reward_std": 0.5463275909423828, | |
| "rewards/rollout_reward_func/mean": -0.1721905618906021, | |
| "rewards/rollout_reward_func/std": 0.5355534553527832, | |
| "sampling/importance_sampling_ratio/max": 0.41675877571105957, | |
| "sampling/importance_sampling_ratio/mean": 0.2475878894329071, | |
| "sampling/importance_sampling_ratio/min": 2.238332399429055e-07, | |
| "sampling/sampling_logp_difference/max": 3.269535541534424, | |
| "sampling/sampling_logp_difference/mean": 1.0780588388442993, | |
| "step": 55, | |
| "step_time": 5.403120343999035 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03125, | |
| "clip_ratio/high_mean": 0.015625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.015625, | |
| "entropy": 5.5938030779361725, | |
| "epoch": 0.00056, | |
| "grad_norm": 0.02255568467080593, | |
| "kl": 0.4606229439377785, | |
| "learning_rate": 9.999999814880132e-06, | |
| "loss": -0.0136, | |
| "step": 56, | |
| "step_time": 3.2759828810003455 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 6.0, | |
| "completions/mean_length": 3.0, | |
| "completions/mean_terminated_length": 2.133333444595337, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 5.78191477060318, | |
| "epoch": 0.00057, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01589336432516575, | |
| "kl": 0.42849645018577576, | |
| "learning_rate": 9.999999795905347e-06, | |
| "loss": -0.009, | |
| "num_tokens": 1031529.0, | |
| "reward": -0.07622712850570679, | |
| "reward_std": 0.6089849472045898, | |
| "rewards/rollout_reward_func/mean": -0.07622712850570679, | |
| "rewards/rollout_reward_func/std": 1.0020984411239624, | |
| "sampling/importance_sampling_ratio/max": 0.425922691822052, | |
| "sampling/importance_sampling_ratio/mean": 0.229965940117836, | |
| "sampling/importance_sampling_ratio/min": 6.920892747785956e-09, | |
| "sampling/sampling_logp_difference/max": 3.239529609680176, | |
| "sampling/sampling_logp_difference/mean": 1.154854416847229, | |
| "step": 57, | |
| "step_time": 5.6268242130017825 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 5.738904058933258, | |
| "epoch": 0.00058, | |
| "grad_norm": 0.013298124074935913, | |
| "kl": 0.3968820311129093, | |
| "learning_rate": 9.999999776004962e-06, | |
| "loss": -0.009, | |
| "step": 58, | |
| "step_time": 2.7349948540004334 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 16.0, | |
| "completions/max_terminated_length": 13.0, | |
| "completions/mean_length": 3.21875, | |
| "completions/mean_terminated_length": 2.366666793823242, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 6.8580867648124695, | |
| "epoch": 0.00059, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.013808239251375198, | |
| "kl": 0.29788459837436676, | |
| "learning_rate": 9.999999755178978e-06, | |
| "loss": -0.0089, | |
| "num_tokens": 1065697.0, | |
| "reward": -0.1516963541507721, | |
| "reward_std": 0.6373498439788818, | |
| "rewards/rollout_reward_func/mean": -0.1516963541507721, | |
| "rewards/rollout_reward_func/std": 0.7994406819343567, | |
| "sampling/importance_sampling_ratio/max": 0.4323439598083496, | |
| "sampling/importance_sampling_ratio/mean": 0.16014467179775238, | |
| "sampling/importance_sampling_ratio/min": 3.285566610444768e-11, | |
| "sampling/sampling_logp_difference/max": 3.304318428039551, | |
| "sampling/sampling_logp_difference/mean": 1.2896037101745605, | |
| "step": 59, | |
| "step_time": 5.172425778999241 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 6.81160169839859, | |
| "epoch": 0.0006, | |
| "grad_norm": 0.012560456059873104, | |
| "kl": 0.2846912257373333, | |
| "learning_rate": 9.999999733427394e-06, | |
| "loss": -0.0089, | |
| "step": 60, | |
| "step_time": 2.6920222089984236 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 200000, | |
| "num_input_tokens_seen": 1065697, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |