Instructions to use bimabk/test-env-gin-rummy with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries
PEFT
How to use bimabk/test-env-gin-rummy with PEFT:
```
Base model is not found.
```

How to use bimabk/test-env-gin-rummy with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="bimabk/test-env-gin-rummy")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("bimabk/test-env-gin-rummy", dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use bimabk/test-env-gin-rummy with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "bimabk/test-env-gin-rummy"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "bimabk/test-env-gin-rummy",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/bimabk/test-env-gin-rummy

SGLang

How to use bimabk/test-env-gin-rummy with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "bimabk/test-env-gin-rummy" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "bimabk/test-env-gin-rummy",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "bimabk/test-env-gin-rummy" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "bimabk/test-env-gin-rummy",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use bimabk/test-env-gin-rummy with Docker Model Runner:
```
docker model run hf.co/bimabk/test-env-gin-rummy
```

test-env-gin-rummy / trainer_state.json

bimabk

Upload task output 1

015345c verified about 2 months ago

raw

history blame contribute delete

365 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.006,
	"eval_steps": 500,
	"global_step": 300,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06598061177646741,
	"epoch": 2e-05,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.003923534415662289,
	"kl": 0.0,
	"learning_rate": 0.0,
	"loss": -0.0,
	"num_tokens": 102665.0,
	"reward": 2.355022430419922,
	"reward_std": 0.3552054464817047,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.501227617263794,
	"rewards/rollout_reward_func/std": 0.18640437722206116,
	"sampling/importance_sampling_ratio/max": 1.0961512327194214,
	"sampling/importance_sampling_ratio/mean": 0.9703092575073242,
	"sampling/importance_sampling_ratio/min": 0.5060414671897888,
	"sampling/sampling_logp_difference/max": 0.6756159067153931,
	"sampling/sampling_logp_difference/mean": 0.0183907151222229,
	"step": 1,
	"step_time": 29.075429260999726
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.06598061177646741,
	"epoch": 4e-05,
	"grad_norm": 0.003917683847248554,
	"kl": 0.0,
	"learning_rate": 2.2857142857142855e-07,
	"loss": -0.0,
	"step": 2,
	"step_time": 11.468670933999988
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05838002988048174,
	"epoch": 6e-05,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004170146305114031,
	"kl": 0.0014184596652553338,
	"learning_rate": 4.571428571428571e-07,
	"loss": 0.0,
	"num_tokens": 205842.0,
	"reward": 2.2323365211486816,
	"reward_std": 0.41563019156455994,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.970565140247345,
	"rewards/probe_shaping_dominance/std": 0.11582481861114502,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.5944784879684448,
	"rewards/rollout_reward_func/std": 0.19796565175056458,
	"sampling/importance_sampling_ratio/max": 1.4160258769989014,
	"sampling/importance_sampling_ratio/mean": 1.0286931991577148,
	"sampling/importance_sampling_ratio/min": 0.8523033857345581,
	"sampling/sampling_logp_difference/max": 0.34715062379837036,
	"sampling/sampling_logp_difference/mean": 0.01565416157245636,
	"step": 3,
	"step_time": 26.976024578999954
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.06273448248884961,
	"epoch": 8e-05,
	"grad_norm": 0.0025308942422270775,
	"kl": 0.004324701569430545,
	"learning_rate": 6.857142857142857e-07,
	"loss": 0.0,
	"step": 4,
	"step_time": 12.765090235000116
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.78125,
	"completions/mean_terminated_length": 2.78125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06146455561975017,
	"epoch": 0.0001,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010321667417883873,
	"kl": 0.005618094519680539,
	"learning_rate": 9.142857142857142e-07,
	"loss": 0.0,
	"num_tokens": 303571.0,
	"reward": 2.236471176147461,
	"reward_std": 0.5468828678131104,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.78125,
	"rewards/probe_completion_length/std": 0.420013427734375,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9439389705657959,
	"rewards/probe_shaping_dominance/std": 0.15084654092788696,
	"rewards/probe_terminal_raw/mean": 0.0625,
	"rewards/probe_terminal_raw/std": 0.16800537705421448,
	"rewards/rollout_reward_func/mean": -0.5324676036834717,
	"rewards/rollout_reward_func/std": 0.24024422466754913,
	"sampling/importance_sampling_ratio/max": 1.3134887218475342,
	"sampling/importance_sampling_ratio/mean": 0.9676171541213989,
	"sampling/importance_sampling_ratio/min": 0.41273218393325806,
	"sampling/sampling_logp_difference/max": 0.8849565982818604,
	"sampling/sampling_logp_difference/mean": 0.026659058406949043,
	"step": 5,
	"step_time": 26.660728665999955
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.06810211515403353,
	"epoch": 0.00012,
	"grad_norm": 0.007714552339166403,
	"kl": 0.0028154569756466685,
	"learning_rate": 1.1428571428571428e-06,
	"loss": 0.0,
	"step": 6,
	"step_time": 11.44768754599977
	},
	{
	"clip_ratio/high_max": 0.06250000186264515,
	"clip_ratio/high_mean": 0.031250000931322575,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04115386162811774,
	"epoch": 0.00014,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.003323962911963463,
	"kl": 0.001360555283525855,
	"learning_rate": 1.3714285714285715e-06,
	"loss": 0.0,
	"num_tokens": 410424.0,
	"reward": 2.2917943000793457,
	"reward_std": 0.44559940695762634,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9733562469482422,
	"rewards/probe_shaping_dominance/std": 0.10957542806863785,
	"rewards/probe_terminal_raw/mean": 0.027566056698560715,
	"rewards/probe_terminal_raw/std": 0.10949952900409698,
	"rewards/rollout_reward_func/mean": -0.5341278314590454,
	"rewards/rollout_reward_func/std": 0.27136242389678955,
	"sampling/importance_sampling_ratio/max": 1.0618572235107422,
	"sampling/importance_sampling_ratio/mean": 0.9585317969322205,
	"sampling/importance_sampling_ratio/min": 0.2324376255273819,
	"sampling/sampling_logp_difference/max": 1.470571756362915,
	"sampling/sampling_logp_difference/mean": 0.02589060366153717,
	"step": 7,
	"step_time": 27.5007078120002
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.04836703218097682,
	"epoch": 0.00016,
	"grad_norm": 0.005415800027549267,
	"kl": 0.001694043724171479,
	"learning_rate": 1.6e-06,
	"loss": 0.0,
	"step": 8,
	"step_time": 12.223170772000117
	},
	{
	"clip_ratio/high_max": 0.06250000186264515,
	"clip_ratio/high_mean": 0.031250000931322575,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07847535189284827,
	"epoch": 0.00018,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010716816410422325,
	"kl": 0.004078912243130617,
	"learning_rate": 1.8285714285714284e-06,
	"loss": -0.0,
	"num_tokens": 511562.0,
	"reward": 2.3355042934417725,
	"reward_std": 0.43706634640693665,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9753304123878479,
	"rewards/probe_shaping_dominance/std": 0.0984005331993103,
	"rewards/probe_terminal_raw/mean": 0.026295732706785202,
	"rewards/probe_terminal_raw/std": 0.10541322082281113,
	"rewards/rollout_reward_func/mean": -0.553621768951416,
	"rewards/rollout_reward_func/std": 0.20992274582386017,
	"sampling/importance_sampling_ratio/max": 2.0806119441986084,
	"sampling/importance_sampling_ratio/mean": 1.0222396850585938,
	"sampling/importance_sampling_ratio/min": 0.5085986256599426,
	"sampling/sampling_logp_difference/max": 0.7373225688934326,
	"sampling/sampling_logp_difference/mean": 0.028744252398610115,
	"step": 9,
	"step_time": 26.567318749999913
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.07049791459576227,
	"epoch": 0.0002,
	"grad_norm": 0.004469083622097969,
	"kl": 0.026501665124972873,
	"learning_rate": 2.057142857142857e-06,
	"loss": -0.0,
	"step": 10,
	"step_time": 11.64468052799998
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05360435344118741,
	"epoch": 0.00022,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.008809314109385014,
	"kl": 0.004907062985087585,
	"learning_rate": 2.2857142857142856e-06,
	"loss": -0.0,
	"num_tokens": 616201.0,
	"reward": 2.4397201538085938,
	"reward_std": 0.5087255239486694,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.24593468010425568,
	"rewards/probe_shaping_dominance/mean": 0.9706827998161316,
	"rewards/probe_shaping_dominance/std": 0.12301044911146164,
	"rewards/probe_terminal_raw/mean": 0.025406504049897194,
	"rewards/probe_terminal_raw/std": 0.10275532305240631,
	"rewards/rollout_reward_func/mean": -0.5063689351081848,
	"rewards/rollout_reward_func/std": 0.27631497383117676,
	"sampling/importance_sampling_ratio/max": 1.1329089403152466,
	"sampling/importance_sampling_ratio/mean": 0.9933090806007385,
	"sampling/importance_sampling_ratio/min": 0.768523633480072,
	"sampling/sampling_logp_difference/max": 0.2632848620414734,
	"sampling/sampling_logp_difference/mean": 0.007937189191579819,
	"step": 11,
	"step_time": 27.777191968999887
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.056653952024134924,
	"epoch": 0.00024,
	"grad_norm": 0.005528156645596027,
	"kl": 0.0032436020156101364,
	"learning_rate": 2.5142857142857142e-06,
	"loss": -0.0,
	"step": 12,
	"step_time": 11.55833436900025
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.038703071273630485,
	"epoch": 0.00026,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.007530162110924721,
	"kl": 0.09287417630221206,
	"learning_rate": 2.742857142857143e-06,
	"loss": -0.0,
	"num_tokens": 724364.0,
	"reward": 2.351245880126953,
	"reward_std": 0.4424680173397064,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9703531265258789,
	"rewards/probe_shaping_dominance/std": 0.11669508367776871,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.47535717487335205,
	"rewards/rollout_reward_func/std": 0.24601998925209045,
	"sampling/importance_sampling_ratio/max": 1.440869688987732,
	"sampling/importance_sampling_ratio/mean": 1.0093717575073242,
	"sampling/importance_sampling_ratio/min": 0.7920892238616943,
	"sampling/sampling_logp_difference/max": 0.3652459681034088,
	"sampling/sampling_logp_difference/mean": 0.008522224612534046,
	"step": 13,
	"step_time": 27.311093626999764
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.0453864433511626,
	"epoch": 0.00028,
	"grad_norm": 0.006435270421206951,
	"kl": 0.010504724175871893,
	"learning_rate": 2.9714285714285716e-06,
	"loss": -0.0,
	"step": 14,
	"step_time": 11.88281524700028
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.037373697148723295,
	"epoch": 0.0003,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006931572686880827,
	"kl": 0.0013499163329698805,
	"learning_rate": 3.2e-06,
	"loss": -0.0,
	"num_tokens": 828160.0,
	"reward": 2.3457703590393066,
	"reward_std": 0.32655069231987,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9890751838684082,
	"rewards/probe_shaping_dominance/std": 0.06179998442530632,
	"rewards/probe_terminal_raw/mean": 0.01092479657381773,
	"rewards/probe_terminal_raw/std": 0.06179998070001602,
	"rewards/rollout_reward_func/mean": -0.5417294502258301,
	"rewards/rollout_reward_func/std": 0.19428227841854095,
	"sampling/importance_sampling_ratio/max": 1.5512616634368896,
	"sampling/importance_sampling_ratio/mean": 1.0071200132369995,
	"sampling/importance_sampling_ratio/min": 0.7788013219833374,
	"sampling/sampling_logp_difference/max": 0.43915224075317383,
	"sampling/sampling_logp_difference/mean": 0.008885648101568222,
	"step": 15,
	"step_time": 27.3166221219999
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.03605123230954632,
	"epoch": 0.00032,
	"grad_norm": 0.007162998430430889,
	"kl": 0.0005329122045578671,
	"learning_rate": 3.428571428571428e-06,
	"loss": -0.0,
	"step": 16,
	"step_time": 12.10146868100037
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04818721191259101,
	"epoch": 0.00034,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0020011591259390116,
	"kl": 0.000880227197208705,
	"learning_rate": 3.657142857142857e-06,
	"loss": 0.0,
	"num_tokens": 933852.0,
	"reward": 2.2396738529205322,
	"reward_std": 0.3769412934780121,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9811877012252808,
	"rewards/probe_shaping_dominance/std": 0.07417813688516617,
	"rewards/probe_terminal_raw/mean": 0.019435975700616837,
	"rewards/probe_terminal_raw/std": 0.07648143172264099,
	"rewards/rollout_reward_func/mean": -0.554699718952179,
	"rewards/rollout_reward_func/std": 0.14253978431224823,
	"sampling/importance_sampling_ratio/max": 1.3911144733428955,
	"sampling/importance_sampling_ratio/mean": 1.0014019012451172,
	"sampling/importance_sampling_ratio/min": 0.647373378276825,
	"sampling/sampling_logp_difference/max": 0.4348297119140625,
	"sampling/sampling_logp_difference/mean": 0.01693439856171608,
	"step": 17,
	"step_time": 27.466287271999818
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.059825868549523875,
	"epoch": 0.00036,
	"grad_norm": 0.00400462094694376,
	"kl": 0.0010442571770683529,
	"learning_rate": 3.885714285714286e-06,
	"loss": 0.0,
	"step": 18,
	"step_time": 11.729434232999665
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07400128486915492,
	"epoch": 0.00038,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004407655913382769,
	"kl": 0.0058712156430829054,
	"learning_rate": 4.114285714285714e-06,
	"loss": -0.0,
	"num_tokens": 1040669.0,
	"reward": 2.3979897499084473,
	"reward_std": 0.3378089666366577,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9721384644508362,
	"rewards/probe_shaping_dominance/std": 0.10964522510766983,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.4928986430168152,
	"rewards/rollout_reward_func/std": 0.280559241771698,
	"sampling/importance_sampling_ratio/max": 1.2489417791366577,
	"sampling/importance_sampling_ratio/mean": 0.9779143333435059,
	"sampling/importance_sampling_ratio/min": 0.5380392670631409,
	"sampling/sampling_logp_difference/max": 0.619827151298523,
	"sampling/sampling_logp_difference/mean": 0.017949596047401428,
	"step": 19,
	"step_time": 28.172511434999933
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.07153313838352915,
	"epoch": 0.0004,
	"grad_norm": 0.010058136656880379,
	"kl": 0.01704683385832595,
	"learning_rate": 4.342857142857142e-06,
	"loss": -0.0,
	"step": 20,
	"step_time": 11.798744088000149
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.78125,
	"completions/mean_terminated_length": 2.78125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07958520320244133,
	"epoch": 0.00042,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.015031801536679268,
	"kl": 0.02134023218428638,
	"learning_rate": 4.571428571428571e-06,
	"loss": 0.0,
	"num_tokens": 1146440.0,
	"reward": 2.2259719371795654,
	"reward_std": 0.4264923334121704,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.78125,
	"rewards/probe_completion_length/std": 0.420013427734375,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9849475622177124,
	"rewards/probe_shaping_dominance/std": 0.08514932543039322,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.505850613117218,
	"rewards/rollout_reward_func/std": 0.22946372628211975,
	"sampling/importance_sampling_ratio/max": 1.8730424642562866,
	"sampling/importance_sampling_ratio/mean": 1.0450382232666016,
	"sampling/importance_sampling_ratio/min": 0.6261028051376343,
	"sampling/sampling_logp_difference/max": 0.6275629997253418,
	"sampling/sampling_logp_difference/mean": 0.033233314752578735,
	"step": 21,
	"step_time": 27.33938806800029
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.08325157128274441,
	"epoch": 0.00044,
	"grad_norm": 0.01334489043802023,
	"kl": 0.01684667149083907,
	"learning_rate": 4.8e-06,
	"loss": 0.0,
	"step": 22,
	"step_time": 11.840854454999999
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06853798200609162,
	"epoch": 0.00046,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.01755128987133503,
	"kl": 0.003467819899402258,
	"learning_rate": 5.0285714285714285e-06,
	"loss": 0.0001,
	"num_tokens": 1248638.0,
	"reward": 2.270667552947998,
	"reward_std": 0.47502174973487854,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.978266716003418,
	"rewards/probe_shaping_dominance/std": 0.08902076631784439,
	"rewards/probe_terminal_raw/mean": 0.025406504049897194,
	"rewards/probe_terminal_raw/std": 0.10275533050298691,
	"rewards/rollout_reward_func/mean": -0.495505690574646,
	"rewards/rollout_reward_func/std": 0.24283160269260406,
	"sampling/importance_sampling_ratio/max": 2.039003610610962,
	"sampling/importance_sampling_ratio/mean": 1.0263185501098633,
	"sampling/importance_sampling_ratio/min": 0.6725395321846008,
	"sampling/sampling_logp_difference/max": 0.8136651515960693,
	"sampling/sampling_logp_difference/mean": 0.02945869043469429,
	"step": 23,
	"step_time": 27.97098964299971
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.0757814844546374,
	"epoch": 0.00048,
	"grad_norm": 0.02817094884812832,
	"kl": 0.009625433100154623,
	"learning_rate": 5.257142857142857e-06,
	"loss": 0.0001,
	"step": 24,
	"step_time": 11.866423993000353
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.046443949002423324,
	"epoch": 0.0005,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002482979791238904,
	"kl": 0.011937914369631542,
	"learning_rate": 5.485714285714286e-06,
	"loss": -0.0,
	"num_tokens": 1348967.0,
	"reward": 2.4115562438964844,
	"reward_std": 0.4029836654663086,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.987568199634552,
	"rewards/probe_shaping_dominance/std": 0.07032480090856552,
	"rewards/probe_terminal_raw/mean": 0.011941056698560715,
	"rewards/probe_terminal_raw/std": 0.06754881888628006,
	"rewards/rollout_reward_func/mean": -0.4754529595375061,
	"rewards/rollout_reward_func/std": 0.20119507610797882,
	"sampling/importance_sampling_ratio/max": 1.2200837135314941,
	"sampling/importance_sampling_ratio/mean": 0.9975783824920654,
	"sampling/importance_sampling_ratio/min": 0.8279879689216614,
	"sampling/sampling_logp_difference/max": 0.1989191770553589,
	"sampling/sampling_logp_difference/mean": 0.011062754318118095,
	"step": 25,
	"step_time": 26.57660025700011
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.048878253597649746,
	"epoch": 0.00052,
	"grad_norm": 0.009242719039320946,
	"kl": 0.008345632606265863,
	"learning_rate": 5.7142857142857145e-06,
	"loss": -0.0,
	"step": 26,
	"step_time": 11.446816336000438
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.03407000357401557,
	"epoch": 0.00054,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0032159958500415087,
	"kl": 0.0009551170151098631,
	"learning_rate": 5.942857142857143e-06,
	"loss": 0.0001,
	"num_tokens": 1454840.0,
	"reward": 2.308957099914551,
	"reward_std": 0.35809147357940674,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9919047951698303,
	"rewards/probe_shaping_dominance/std": 0.04579342529177666,
	"rewards/probe_terminal_raw/mean": 0.00825711339712143,
	"rewards/probe_terminal_raw/std": 0.04670928418636322,
	"rewards/rollout_reward_func/mean": -0.4849545955657959,
	"rewards/rollout_reward_func/std": 0.17723596096038818,
	"sampling/importance_sampling_ratio/max": 1.3277825117111206,
	"sampling/importance_sampling_ratio/mean": 1.03197181224823,
	"sampling/importance_sampling_ratio/min": 0.9784432053565979,
	"sampling/sampling_logp_difference/max": 0.2835111618041992,
	"sampling/sampling_logp_difference/mean": 0.010589659214019775,
	"step": 27,
	"step_time": 27.828797529999974
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.031250000931322575,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.03843506844714284,
	"epoch": 0.00056,
	"grad_norm": 0.001164909452199936,
	"kl": 0.0005121690442896343,
	"learning_rate": 6.171428571428571e-06,
	"loss": 0.0001,
	"step": 28,
	"step_time": 11.809285704000104
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04930314904777333,
	"epoch": 0.00058,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002748744795098901,
	"kl": 0.004907883932952495,
	"learning_rate": 6.4e-06,
	"loss": -0.0,
	"num_tokens": 1556979.0,
	"reward": 2.240399122238159,
	"reward_std": 0.4602973461151123,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9860905408859253,
	"rewards/probe_shaping_dominance/std": 0.0786839947104454,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.5863164663314819,
	"rewards/rollout_reward_func/std": 0.2140309065580368,
	"sampling/importance_sampling_ratio/max": 1.2453359365463257,
	"sampling/importance_sampling_ratio/mean": 0.9654719233512878,
	"sampling/importance_sampling_ratio/min": 0.4166664183139801,
	"sampling/sampling_logp_difference/max": 0.8754727840423584,
	"sampling/sampling_logp_difference/mean": 0.023819994181394577,
	"step": 29,
	"step_time": 26.907328863000203
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.04787830199347809,
	"epoch": 0.0006,
	"grad_norm": 0.004575630649924278,
	"kl": 0.021033072499267114,
	"learning_rate": 6.628571428571428e-06,
	"loss": -0.0,
	"step": 30,
	"step_time": 12.03838489500049
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06083334801951423,
	"epoch": 0.00062,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.013260968960821629,
	"kl": 0.0185297402889546,
	"learning_rate": 6.857142857142856e-06,
	"loss": 0.0001,
	"num_tokens": 1662740.0,
	"reward": 2.1973555088043213,
	"reward_std": 0.43850135803222656,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.991719663143158,
	"rewards/probe_shaping_dominance/std": 0.046840641647577286,
	"rewards/probe_terminal_raw/mean": 0.008384146727621555,
	"rewards/probe_terminal_raw/std": 0.04742789641022682,
	"rewards/rollout_reward_func/mean": -0.5964983701705933,
	"rewards/rollout_reward_func/std": 0.296856164932251,
	"sampling/importance_sampling_ratio/max": 2.8883938789367676,
	"sampling/importance_sampling_ratio/mean": 1.041499376296997,
	"sampling/importance_sampling_ratio/min": 0.611585795879364,
	"sampling/sampling_logp_difference/max": 0.9767682552337646,
	"sampling/sampling_logp_difference/mean": 0.02332986891269684,
	"step": 31,
	"step_time": 27.415389096000126
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.03750000149011612,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.03750000149011612,
	"entropy": 0.06045454426202923,
	"epoch": 0.00064,
	"grad_norm": 0.014426084235310555,
	"kl": 0.027800074360129656,
	"learning_rate": 7.085714285714285e-06,
	"loss": 0.0001,
	"step": 32,
	"step_time": 11.844893076999824
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0422610079695005,
	"epoch": 0.00066,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005128172226250172,
	"kl": 0.009348716392499568,
	"learning_rate": 7.314285714285714e-06,
	"loss": 0.0,
	"num_tokens": 1765521.0,
	"reward": 2.3525331020355225,
	"reward_std": 0.3403870165348053,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9837720394134521,
	"rewards/probe_shaping_dominance/std": 0.06571495532989502,
	"rewards/probe_terminal_raw/mean": 0.01880081370472908,
	"rewards/probe_terminal_raw/std": 0.0745616927742958,
	"rewards/rollout_reward_func/mean": -0.5375398397445679,
	"rewards/rollout_reward_func/std": 0.22309184074401855,
	"sampling/importance_sampling_ratio/max": 1.275700569152832,
	"sampling/importance_sampling_ratio/mean": 0.994273841381073,
	"sampling/importance_sampling_ratio/min": 0.600629448890686,
	"sampling/sampling_logp_difference/max": 0.5097755193710327,
	"sampling/sampling_logp_difference/mean": 0.011872323229908943,
	"step": 33,
	"step_time": 27.277823512999475
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.0630603444587905,
	"epoch": 0.00068,
	"grad_norm": 0.007451063022017479,
	"kl": 0.007260499390742581,
	"learning_rate": 7.542857142857142e-06,
	"loss": 0.0,
	"step": 34,
	"step_time": 12.15706381699988
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 4.0,
	"completions/max_terminated_length": 4.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06165817377041094,
	"epoch": 0.0007,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.01730046235024929,
	"kl": 0.007911830088153327,
	"learning_rate": 7.771428571428572e-06,
	"loss": 0.0,
	"num_tokens": 1868519.0,
	"reward": 2.275172233581543,
	"reward_std": 0.48706814646720886,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.42121174931526184,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9609175324440002,
	"rewards/probe_shaping_dominance/std": 0.12447085976600647,
	"rewards/probe_terminal_raw/mean": 0.042174797505140305,
	"rewards/probe_terminal_raw/std": 0.13503843545913696,
	"rewards/rollout_reward_func/mean": -0.552919864654541,
	"rewards/rollout_reward_func/std": 0.20079734921455383,
	"sampling/importance_sampling_ratio/max": 2.4695143699645996,
	"sampling/importance_sampling_ratio/mean": 1.0170851945877075,
	"sampling/importance_sampling_ratio/min": 0.5358201861381531,
	"sampling/sampling_logp_difference/max": 0.9040230512619019,
	"sampling/sampling_logp_difference/mean": 0.023447973653674126,
	"step": 35,
	"step_time": 26.740296546999843
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.058829510177019984,
	"epoch": 0.00072,
	"grad_norm": 0.0026921494863927364,
	"kl": 0.008077224918185522,
	"learning_rate": 8e-06,
	"loss": 0.0,
	"step": 36,
	"step_time": 11.526741372999822
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.78125,
	"completions/mean_terminated_length": 2.78125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0314667156167161,
	"epoch": 0.00074,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0024028760381042957,
	"kl": 0.00625098004627489,
	"learning_rate": 7.999999998518522e-06,
	"loss": -0.0,
	"num_tokens": 1970124.0,
	"reward": 2.264838933944702,
	"reward_std": 0.5270799994468689,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.78125,
	"rewards/probe_completion_length/std": 0.420013427734375,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9424034953117371,
	"rewards/probe_shaping_dominance/std": 0.13680323958396912,
	"rewards/probe_terminal_raw/mean": 0.05856199190020561,
	"rewards/probe_terminal_raw/std": 0.1405627578496933,
	"rewards/rollout_reward_func/mean": -0.4673765003681183,
	"rewards/rollout_reward_func/std": 0.2097388207912445,
	"sampling/importance_sampling_ratio/max": 1.8680520057678223,
	"sampling/importance_sampling_ratio/mean": 1.0426936149597168,
	"sampling/importance_sampling_ratio/min": 0.9883837103843689,
	"sampling/sampling_logp_difference/max": 0.6248946189880371,
	"sampling/sampling_logp_difference/mean": 0.012692131102085114,
	"step": 37,
	"step_time": 26.3523716899997
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.0313223133775864,
	"epoch": 0.00076,
	"grad_norm": 0.0023324843496084213,
	"kl": 0.0035868614445746516,
	"learning_rate": 7.99999999407409e-06,
	"loss": -0.0,
	"step": 38,
	"step_time": 12.628685679
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05816701124422252,
	"epoch": 0.00078,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.007494654040783644,
	"kl": 0.03421914212867705,
	"learning_rate": 7.999999986666703e-06,
	"loss": -0.0,
	"num_tokens": 2076598.0,
	"reward": 2.311230182647705,
	"reward_std": 0.36618658900260925,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9932495951652527,
	"rewards/probe_shaping_dominance/std": 0.03818599134683609,
	"rewards/probe_terminal_raw/mean": 0.00889227632433176,
	"rewards/probe_terminal_raw/std": 0.05030231550335884,
	"rewards/rollout_reward_func/mean": -0.6096617579460144,
	"rewards/rollout_reward_func/std": 0.20722205936908722,
	"sampling/importance_sampling_ratio/max": 1.4155004024505615,
	"sampling/importance_sampling_ratio/mean": 0.9876462817192078,
	"sampling/importance_sampling_ratio/min": 0.7839126586914062,
	"sampling/sampling_logp_difference/max": 0.3471514582633972,
	"sampling/sampling_logp_difference/mean": 0.0168665312230587,
	"step": 39,
	"step_time": 26.542540336999764
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.057803097704891115,
	"epoch": 0.0008,
	"grad_norm": 0.004047502297908068,
	"kl": 0.02604524488651805,
	"learning_rate": 7.99999997629636e-06,
	"loss": -0.0,
	"step": 40,
	"step_time": 11.67055183600064
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.030615816707722843,
	"epoch": 0.00082,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002531230915337801,
	"kl": 0.0002023791248291218,
	"learning_rate": 7.999999962963062e-06,
	"loss": 0.0,
	"num_tokens": 2182025.0,
	"reward": 2.3659095764160156,
	"reward_std": 0.3363305926322937,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.970511794090271,
	"rewards/probe_shaping_dominance/std": 0.11608950048685074,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.49210208654403687,
	"rewards/rollout_reward_func/std": 0.19491301476955414,
	"sampling/importance_sampling_ratio/max": 1.0795150995254517,
	"sampling/importance_sampling_ratio/mean": 1.0009956359863281,
	"sampling/importance_sampling_ratio/min": 0.9117990136146545,
	"sampling/sampling_logp_difference/max": 0.09234827756881714,
	"sampling/sampling_logp_difference/mean": 0.004786844830960035,
	"step": 41,
	"step_time": 26.691086626000242
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.029701224237214774,
	"epoch": 0.00084,
	"grad_norm": 0.0024189443793147802,
	"kl": 0.0002964178702313802,
	"learning_rate": 7.999999946666809e-06,
	"loss": 0.0,
	"step": 42,
	"step_time": 12.699684607000108
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 4.0,
	"completions/max_terminated_length": 4.0,
	"completions/mean_length": 3.0,
	"completions/mean_terminated_length": 3.0,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05056236406426251,
	"epoch": 0.00086,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.01209098007529974,
	"kl": 0.010471812368450628,
	"learning_rate": 7.999999927407602e-06,
	"loss": -0.0,
	"num_tokens": 2286142.0,
	"reward": 2.469311237335205,
	"reward_std": 0.4115804135799408,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 2.0,
	"rewards/probe_completion_length/std": 0.2540002465248108,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9852168560028076,
	"rewards/probe_shaping_dominance/std": 0.0836259201169014,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.48153066635131836,
	"rewards/rollout_reward_func/std": 0.24715669453144073,
	"sampling/importance_sampling_ratio/max": 2.0913164615631104,
	"sampling/importance_sampling_ratio/mean": 1.0417256355285645,
	"sampling/importance_sampling_ratio/min": 0.8711547255516052,
	"sampling/sampling_logp_difference/max": 0.7377924919128418,
	"sampling/sampling_logp_difference/mean": 0.016645925119519234,
	"step": 43,
	"step_time": 26.97034319699992
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.06137717463207082,
	"epoch": 0.00088,
	"grad_norm": 0.004214904736727476,
	"kl": 0.02022934940032428,
	"learning_rate": 7.99999990518544e-06,
	"loss": -0.0,
	"step": 44,
	"step_time": 11.70073124500027
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.015625,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.028125000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.78125,
	"completions/mean_terminated_length": 2.78125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0973230431554839,
	"epoch": 0.0009,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.008132712915539742,
	"kl": 0.012426901788174405,
	"learning_rate": 7.999999880000322e-06,
	"loss": 0.0,
	"num_tokens": 2390804.0,
	"reward": 2.2431583404541016,
	"reward_std": 0.5248546600341797,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.78125,
	"rewards/probe_completion_length/std": 0.420013427734375,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9669345021247864,
	"rewards/probe_shaping_dominance/std": 0.10819793492555618,
	"rewards/probe_terminal_raw/mean": 0.038998983800411224,
	"rewards/probe_terminal_raw/std": 0.1286177635192871,
	"rewards/rollout_reward_func/mean": -0.4940252900123596,
	"rewards/rollout_reward_func/std": 0.255024790763855,
	"sampling/importance_sampling_ratio/max": 1.6163866519927979,
	"sampling/importance_sampling_ratio/mean": 0.9977768659591675,
	"sampling/importance_sampling_ratio/min": 0.3879617154598236,
	"sampling/sampling_logp_difference/max": 0.9467527270317078,
	"sampling/sampling_logp_difference/mean": 0.02932477556169033,
	"step": 45,
	"step_time": 26.472695325999894
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.03125,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.043750000186264515,
	"entropy": 0.09132259455509484,
	"epoch": 0.00092,
	"grad_norm": 0.004103749990463257,
	"kl": 0.02156046110090415,
	"learning_rate": 7.99999985185225e-06,
	"loss": 0.0,
	"step": 46,
	"step_time": 12.17020241299997
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08302483463194221,
	"epoch": 0.00094,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.00962382648140192,
	"kl": 0.05296483388110573,
	"learning_rate": 7.999999820741223e-06,
	"loss": 0.0,
	"num_tokens": 2498950.0,
	"reward": 2.3484296798706055,
	"reward_std": 0.40232396125793457,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9905115365982056,
	"rewards/probe_shaping_dominance/std": 0.05367483198642731,
	"rewards/probe_terminal_raw/mean": 0.009019308723509312,
	"rewards/probe_terminal_raw/std": 0.05102091282606125,
	"rewards/rollout_reward_func/mean": -0.507351279258728,
	"rewards/rollout_reward_func/std": 0.22662682831287384,
	"sampling/importance_sampling_ratio/max": 1.3692384958267212,
	"sampling/importance_sampling_ratio/mean": 0.9901071786880493,
	"sampling/importance_sampling_ratio/min": 0.3076327443122864,
	"sampling/sampling_logp_difference/max": 1.179471731185913,
	"sampling/sampling_logp_difference/mean": 0.03242562711238861,
	"step": 47,
	"step_time": 26.895124169999463
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.07248554454417899,
	"epoch": 0.00096,
	"grad_norm": 0.01555224135518074,
	"kl": 0.039988372170228104,
	"learning_rate": 7.99999978666724e-06,
	"loss": -0.0,
	"step": 48,
	"step_time": 11.803917615999808
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06659889499132987,
	"epoch": 0.00098,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.007047319784760475,
	"kl": 0.038143942947499454,
	"learning_rate": 7.999999749630303e-06,
	"loss": 0.0001,
	"num_tokens": 2605752.0,
	"reward": 2.304872512817383,
	"reward_std": 0.4004109501838684,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5201276540756226,
	"rewards/rollout_reward_func/std": 0.2584696114063263,
	"sampling/importance_sampling_ratio/max": 2.615042209625244,
	"sampling/importance_sampling_ratio/mean": 1.0269113779067993,
	"sampling/importance_sampling_ratio/min": 0.39808669686317444,
	"sampling/sampling_logp_difference/max": 0.9612793922424316,
	"sampling/sampling_logp_difference/mean": 0.03832431882619858,
	"step": 49,
	"step_time": 26.91781551100007
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.0618492451030761,
	"epoch": 0.001,
	"grad_norm": 0.00791104231029749,
	"kl": 0.05557279207035515,
	"learning_rate": 7.999999709630412e-06,
	"loss": 0.0001,
	"step": 50,
	"step_time": 12.788009578999208
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05631835470558144,
	"epoch": 0.00102,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0038132003974169493,
	"kl": 0.029594353904632498,
	"learning_rate": 7.999999666667564e-06,
	"loss": 0.0,
	"num_tokens": 2707257.0,
	"reward": 2.346804618835449,
	"reward_std": 0.2936249077320099,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.571945309638977,
	"rewards/rollout_reward_func/std": 0.23333650827407837,
	"sampling/importance_sampling_ratio/max": 1.6730494499206543,
	"sampling/importance_sampling_ratio/mean": 0.9981693029403687,
	"sampling/importance_sampling_ratio/min": 0.40917959809303284,
	"sampling/sampling_logp_difference/max": 0.9063196182250977,
	"sampling/sampling_logp_difference/mean": 0.024803204461932182,
	"step": 51,
	"step_time": 26.73268852599972
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.05670045691658743,
	"epoch": 0.00104,
	"grad_norm": 0.003768681548535824,
	"kl": 0.030258090482694455,
	"learning_rate": 7.999999620741765e-06,
	"loss": 0.0,
	"step": 52,
	"step_time": 11.579914525999584
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.1101932916790247,
	"epoch": 0.00106,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0030547163914889097,
	"kl": 0.01951221001081649,
	"learning_rate": 7.999999571853009e-06,
	"loss": 0.0,
	"num_tokens": 2811393.0,
	"reward": 2.1927480697631836,
	"reward_std": 0.406143456697464,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9884125590324402,
	"rewards/probe_shaping_dominance/std": 0.0655483826994896,
	"rewards/probe_terminal_raw/mean": 0.01143292710185051,
	"rewards/probe_terminal_raw/std": 0.06467439979314804,
	"rewards/rollout_reward_func/mean": -0.5695973038673401,
	"rewards/rollout_reward_func/std": 0.16589799523353577,
	"sampling/importance_sampling_ratio/max": 1.0527032613754272,
	"sampling/importance_sampling_ratio/mean": 0.9693626165390015,
	"sampling/importance_sampling_ratio/min": 0.5484977960586548,
	"sampling/sampling_logp_difference/max": 0.6245040893554688,
	"sampling/sampling_logp_difference/mean": 0.023702893406152725,
	"step": 53,
	"step_time": 27.233809398999938
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.11004186974605545,
	"epoch": 0.00108,
	"grad_norm": 0.006082055624574423,
	"kl": 0.04293493747854882,
	"learning_rate": 7.999999520001299e-06,
	"loss": 0.0,
	"step": 54,
	"step_time": 12.14583877500013
	},
	{
	"clip_ratio/high_max": 0.05208333395421505,
	"clip_ratio/high_mean": 0.026041666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.026041666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0866635709971888,
	"epoch": 0.0011,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005786150228232145,
	"kl": 0.045153988463084715,
	"learning_rate": 7.999999465186634e-06,
	"loss": 0.0,
	"num_tokens": 2914367.0,
	"reward": 2.3385372161865234,
	"reward_std": 0.3273521363735199,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5177128314971924,
	"rewards/rollout_reward_func/std": 0.2579730451107025,
	"sampling/importance_sampling_ratio/max": 1.2267568111419678,
	"sampling/importance_sampling_ratio/mean": 0.9484584331512451,
	"sampling/importance_sampling_ratio/min": 0.5135900378227234,
	"sampling/sampling_logp_difference/max": 0.6663306355476379,
	"sampling/sampling_logp_difference/mean": 0.0320717915892601,
	"step": 55,
	"step_time": 26.36462075400027
	},
	{
	"clip_ratio/high_max": 0.0729166679084301,
	"clip_ratio/high_mean": 0.046875000931322575,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.046875000931322575,
	"entropy": 0.09378209740680177,
	"epoch": 0.00112,
	"grad_norm": 0.007270520552992821,
	"kl": 0.05788560025212064,
	"learning_rate": 7.999999407409014e-06,
	"loss": 0.0,
	"step": 56,
	"step_time": 11.583988187999921
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08144025912042707,
	"epoch": 0.00114,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006977744400501251,
	"kl": 0.16513798182256778,
	"learning_rate": 7.99999934666844e-06,
	"loss": -0.0,
	"num_tokens": 3018848.0,
	"reward": 2.2243924140930176,
	"reward_std": 0.4345919191837311,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9774075150489807,
	"rewards/probe_shaping_dominance/std": 0.08980447798967361,
	"rewards/probe_terminal_raw/mean": 0.02489837259054184,
	"rewards/probe_terminal_raw/std": 0.1013173907995224,
	"rewards/rollout_reward_func/mean": -0.540413498878479,
	"rewards/rollout_reward_func/std": 0.20110559463500977,
	"sampling/importance_sampling_ratio/max": 2.1173288822174072,
	"sampling/importance_sampling_ratio/mean": 1.0253949165344238,
	"sampling/importance_sampling_ratio/min": 0.34861743450164795,
	"sampling/sampling_logp_difference/max": 1.0653817653656006,
	"sampling/sampling_logp_difference/mean": 0.03663061559200287,
	"step": 57,
	"step_time": 27.63207101699959
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.08666177466511726,
	"epoch": 0.00116,
	"grad_norm": 0.00648898771032691,
	"kl": 0.14551325980573893,
	"learning_rate": 7.999999282964912e-06,
	"loss": 0.0,
	"step": 58,
	"step_time": 12.149218646000236
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0776638601673767,
	"epoch": 0.00118,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006341388914734125,
	"kl": 0.1438233179026156,
	"learning_rate": 7.999999216298429e-06,
	"loss": 0.0,
	"num_tokens": 3118313.0,
	"reward": 2.337385654449463,
	"reward_std": 0.40537285804748535,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9689397215843201,
	"rewards/probe_shaping_dominance/std": 0.09820227324962616,
	"rewards/probe_terminal_raw/mean": 0.03201219439506531,
	"rewards/probe_terminal_raw/std": 0.10123317688703537,
	"rewards/rollout_reward_func/mean": -0.5198163986206055,
	"rewards/rollout_reward_func/std": 0.24933888018131256,
	"sampling/importance_sampling_ratio/max": 1.642152190208435,
	"sampling/importance_sampling_ratio/mean": 0.9745345115661621,
	"sampling/importance_sampling_ratio/min": 0.32652705907821655,
	"sampling/sampling_logp_difference/max": 1.1220024824142456,
	"sampling/sampling_logp_difference/mean": 0.04093600809574127,
	"step": 59,
	"step_time": 26.148361385999806
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.08309375832322985,
	"epoch": 0.0012,
	"grad_norm": 0.009624861180782318,
	"kl": 0.15202067893005733,
	"learning_rate": 7.999999146668991e-06,
	"loss": 0.0,
	"step": 60,
	"step_time": 11.512923075000117
	},
	{
	"clip_ratio/high_max": 0.07083333469927311,
	"clip_ratio/high_mean": 0.035416667349636555,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.056250001303851604,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10362166631966829,
	"epoch": 0.00122,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.012691067531704903,
	"kl": 0.22026659833500162,
	"learning_rate": 7.999999074076601e-06,
	"loss": 0.0001,
	"num_tokens": 3227556.0,
	"reward": 2.3282229900360107,
	"reward_std": 0.4200522303581238,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.972678005695343,
	"rewards/probe_shaping_dominance/std": 0.10808944702148438,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.4694550037384033,
	"rewards/rollout_reward_func/std": 0.2165255844593048,
	"sampling/importance_sampling_ratio/max": 1.6590189933776855,
	"sampling/importance_sampling_ratio/mean": 0.9916884899139404,
	"sampling/importance_sampling_ratio/min": 0.47236600518226624,
	"sampling/sampling_logp_difference/max": 0.7500003576278687,
	"sampling/sampling_logp_difference/mean": 0.045740097761154175,
	"step": 61,
	"step_time": 28.188819883999713
	},
	{
	"clip_ratio/high_max": 0.07083333469927311,
	"clip_ratio/high_mean": 0.035416667349636555,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04583333432674408,
	"entropy": 0.10312735941261053,
	"epoch": 0.00124,
	"grad_norm": 0.019286708906292915,
	"kl": 0.11081840936094522,
	"learning_rate": 7.999998998521257e-06,
	"loss": 0.0001,
	"step": 62,
	"step_time": 11.837706676999915
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08664211053110193,
	"epoch": 0.00126,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.015839533880352974,
	"kl": 0.10350155318124621,
	"learning_rate": 7.999998920002956e-06,
	"loss": -0.0,
	"num_tokens": 3332394.0,
	"reward": 2.405167579650879,
	"reward_std": 0.46130281686782837,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9712571501731873,
	"rewards/probe_shaping_dominance/std": 0.11318810284137726,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.48483964800834656,
	"rewards/rollout_reward_func/std": 0.24800339341163635,
	"sampling/importance_sampling_ratio/max": 1.9499810934066772,
	"sampling/importance_sampling_ratio/mean": 0.9958123564720154,
	"sampling/importance_sampling_ratio/min": 0.30673947930336,
	"sampling/sampling_logp_difference/max": 0.8753989338874817,
	"sampling/sampling_logp_difference/mean": 0.03312094882130623,
	"step": 63,
	"step_time": 26.684526995999477
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.08623273627017625,
	"epoch": 0.00128,
	"grad_norm": 0.022980431094765663,
	"kl": 0.11929617358450173,
	"learning_rate": 7.999998838521705e-06,
	"loss": -0.0,
	"step": 64,
	"step_time": 12.258063536000009
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07359768182504922,
	"epoch": 0.0013,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.011483771726489067,
	"kl": 0.10457528214246281,
	"learning_rate": 7.999998754077496e-06,
	"loss": -0.0,
	"num_tokens": 3436726.0,
	"reward": 2.377361297607422,
	"reward_std": 0.5483381748199463,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.41638875007629395,
	"rewards/rollout_reward_func/std": 0.2915210723876953,
	"sampling/importance_sampling_ratio/max": 1.1877729892730713,
	"sampling/importance_sampling_ratio/mean": 0.9874942898750305,
	"sampling/importance_sampling_ratio/min": 0.26991596817970276,
	"sampling/sampling_logp_difference/max": 1.309645414352417,
	"sampling/sampling_logp_difference/mean": 0.027806004509329796,
	"step": 65,
	"step_time": 27.115505474999736
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.06954771315213293,
	"epoch": 0.00132,
	"grad_norm": 0.011225158348679543,
	"kl": 0.4594924821127222,
	"learning_rate": 7.999998666670336e-06,
	"loss": -0.0,
	"step": 66,
	"step_time": 11.664916763999372
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05904226377606392,
	"epoch": 0.00134,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.012288263067603111,
	"kl": 0.0946728276903741,
	"learning_rate": 7.999998576300222e-06,
	"loss": -0.0,
	"num_tokens": 3541291.0,
	"reward": 2.2826719284057617,
	"reward_std": 0.36464667320251465,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9738304615020752,
	"rewards/probe_shaping_dominance/std": 0.0877876877784729,
	"rewards/probe_terminal_raw/mean": 0.03137703239917755,
	"rewards/probe_terminal_raw/std": 0.10557617992162704,
	"rewards/rollout_reward_func/mean": -0.6100356578826904,
	"rewards/rollout_reward_func/std": 0.23593732714653015,
	"sampling/importance_sampling_ratio/max": 1.271332859992981,
	"sampling/importance_sampling_ratio/mean": 0.9844968914985657,
	"sampling/importance_sampling_ratio/min": 0.3530118763446808,
	"sampling/sampling_logp_difference/max": 1.0369465351104736,
	"sampling/sampling_logp_difference/mean": 0.02148618921637535,
	"step": 67,
	"step_time": 26.421768857000643
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.06591829867102206,
	"epoch": 0.00136,
	"grad_norm": 0.01136076170951128,
	"kl": 0.09406092630524654,
	"learning_rate": 7.999998482967154e-06,
	"loss": -0.0,
	"step": 68,
	"step_time": 12.272947167999973
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09870199719443917,
	"epoch": 0.00138,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.017969369888305664,
	"kl": 0.16196376640436938,
	"learning_rate": 7.999998386671134e-06,
	"loss": 0.0,
	"num_tokens": 3645068.0,
	"reward": 2.2971627712249756,
	"reward_std": 0.3776472806930542,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9890751838684082,
	"rewards/probe_shaping_dominance/std": 0.06179998442530632,
	"rewards/probe_terminal_raw/mean": 0.01092479657381773,
	"rewards/probe_terminal_raw/std": 0.06179998070001602,
	"rewards/rollout_reward_func/mean": -0.5590872764587402,
	"rewards/rollout_reward_func/std": 0.19611209630966187,
	"sampling/importance_sampling_ratio/max": 2.4048268795013428,
	"sampling/importance_sampling_ratio/mean": 0.9662601947784424,
	"sampling/importance_sampling_ratio/min": 0.0,
	"sampling/sampling_logp_difference/max": 1.3840640783309937,
	"sampling/sampling_logp_difference/mean": 0.0624161995947361,
	"step": 69,
	"step_time": 26.791781901999457
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.1055372767150402,
	"epoch": 0.0014,
	"grad_norm": 0.006739933043718338,
	"kl": 0.17029937845654786,
	"learning_rate": 7.999998287412158e-06,
	"loss": 0.0,
	"step": 70,
	"step_time": 11.527228552999532
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0416666679084301,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0755673204548657,
	"epoch": 0.00142,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0038206640165299177,
	"kl": 0.27058742146891746,
	"learning_rate": 7.99999818519023e-06,
	"loss": -0.0,
	"num_tokens": 3745050.0,
	"reward": 2.4418420791625977,
	"reward_std": 0.3276258409023285,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9443497061729431,
	"rewards/probe_shaping_dominance/std": 0.17115506529808044,
	"rewards/probe_terminal_raw/mean": 0.05487804859876633,
	"rewards/probe_terminal_raw/std": 0.15910547971725464,
	"rewards/rollout_reward_func/mean": -0.4761357307434082,
	"rewards/rollout_reward_func/std": 0.27386248111724854,
	"sampling/importance_sampling_ratio/max": 1.2027363777160645,
	"sampling/importance_sampling_ratio/mean": 0.9526693224906921,
	"sampling/importance_sampling_ratio/min": 0.26859819889068604,
	"sampling/sampling_logp_difference/max": 1.314541220664978,
	"sampling/sampling_logp_difference/mean": 0.04236820340156555,
	"step": 71,
	"step_time": 25.810139078000248
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.07833350286819041,
	"epoch": 0.00144,
	"grad_norm": 0.006155087612569332,
	"kl": 0.15766439647995867,
	"learning_rate": 7.999998080005348e-06,
	"loss": -0.0,
	"step": 72,
	"step_time": 11.807300304999444
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 4.0,
	"completions/max_terminated_length": 4.0,
	"completions/mean_length": 2.78125,
	"completions/mean_terminated_length": 2.78125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.12527845823206007,
	"epoch": 0.00146,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.012825227342545986,
	"kl": 0.4211071440950036,
	"learning_rate": 7.999997971857512e-06,
	"loss": 0.0001,
	"num_tokens": 3846778.0,
	"reward": 2.290764570236206,
	"reward_std": 0.5837900042533875,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.78125,
	"rewards/probe_completion_length/std": 0.4908435642719269,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9892492890357971,
	"rewards/probe_shaping_dominance/std": 0.06081530824303627,
	"rewards/probe_terminal_raw/mean": 0.010797764174640179,
	"rewards/probe_terminal_raw/std": 0.06108137592673302,
	"rewards/rollout_reward_func/mean": -0.4405323565006256,
	"rewards/rollout_reward_func/std": 0.3242381811141968,
	"sampling/importance_sampling_ratio/max": 1.6338335275650024,
	"sampling/importance_sampling_ratio/mean": 0.9540376663208008,
	"sampling/importance_sampling_ratio/min": 0.19394879043102264,
	"sampling/sampling_logp_difference/max": 1.26481294631958,
	"sampling/sampling_logp_difference/mean": 0.07170334458351135,
	"step": 73,
	"step_time": 27.727274773000772
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.035416667349636555,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.035416667349636555,
	"entropy": 0.1369485834147781,
	"epoch": 0.00148,
	"grad_norm": 0.006000218912959099,
	"kl": 0.3834730681264773,
	"learning_rate": 7.999997860746726e-06,
	"loss": 0.0,
	"step": 74,
	"step_time": 11.550198297999486
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05670425167772919,
	"epoch": 0.0015,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004246127791702747,
	"kl": 0.26258886672280823,
	"learning_rate": 7.999997746672985e-06,
	"loss": 0.0001,
	"num_tokens": 3952684.0,
	"reward": 2.3076558113098145,
	"reward_std": 0.2708474397659302,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5798441171646118,
	"rewards/rollout_reward_func/std": 0.21061494946479797,
	"sampling/importance_sampling_ratio/max": 1.4762965440750122,
	"sampling/importance_sampling_ratio/mean": 0.9765973091125488,
	"sampling/importance_sampling_ratio/min": 0.1482001394033432,
	"sampling/sampling_logp_difference/max": 1.9091930389404297,
	"sampling/sampling_logp_difference/mean": 0.034642815589904785,
	"step": 75,
	"step_time": 27.424144634000186
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.06237753387540579,
	"epoch": 0.00152,
	"grad_norm": 0.005785573739558458,
	"kl": 0.34405436088127317,
	"learning_rate": 7.999997629636291e-06,
	"loss": 0.0001,
	"step": 76,
	"step_time": 12.303879873000824
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08415639377199113,
	"epoch": 0.00154,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005243807099759579,
	"kl": 0.17415540551155573,
	"learning_rate": 7.999997509636644e-06,
	"loss": 0.0,
	"num_tokens": 4058589.0,
	"reward": 2.46805739402771,
	"reward_std": 0.32934877276420593,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9650155901908875,
	"rewards/probe_shaping_dominance/std": 0.11404264718294144,
	"rewards/probe_terminal_raw/mean": 0.04090446978807449,
	"rewards/probe_terminal_raw/std": 0.13221491873264313,
	"rewards/rollout_reward_func/mean": -0.45661279559135437,
	"rewards/rollout_reward_func/std": 0.2438260018825531,
	"sampling/importance_sampling_ratio/max": 1.467045783996582,
	"sampling/importance_sampling_ratio/mean": 0.9993070363998413,
	"sampling/importance_sampling_ratio/min": 0.5919517874717712,
	"sampling/sampling_logp_difference/max": 0.5126774311065674,
	"sampling/sampling_logp_difference/mean": 0.021975167095661163,
	"step": 77,
	"step_time": 27.026433300999997
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.08070441009476781,
	"epoch": 0.00156,
	"grad_norm": 0.0065447925589978695,
	"kl": 0.1744868414461962,
	"learning_rate": 7.999997386674047e-06,
	"loss": 0.0,
	"step": 78,
	"step_time": 11.744910646999415
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07671235466841608,
	"epoch": 0.00158,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.007739327382296324,
	"kl": 0.10829602145804529,
	"learning_rate": 7.999997260748495e-06,
	"loss": 0.0,
	"num_tokens": 4163362.0,
	"reward": 2.291594982147217,
	"reward_std": 0.39855584502220154,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9897778034210205,
	"rewards/probe_shaping_dominance/std": 0.05782533064484596,
	"rewards/probe_terminal_raw/mean": 0.009908536449074745,
	"rewards/probe_terminal_raw/std": 0.05605114996433258,
	"rewards/rollout_reward_func/mean": -0.5330914855003357,
	"rewards/rollout_reward_func/std": 0.2664976716041565,
	"sampling/importance_sampling_ratio/max": 1.3343223333358765,
	"sampling/importance_sampling_ratio/mean": 0.9947078227996826,
	"sampling/importance_sampling_ratio/min": 0.4244631230831146,
	"sampling/sampling_logp_difference/max": 0.9074487686157227,
	"sampling/sampling_logp_difference/mean": 0.022345466539263725,
	"step": 79,
	"step_time": 27.107436816999325
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.08035149308852851,
	"epoch": 0.0016,
	"grad_norm": 0.00506787933409214,
	"kl": 0.1221858259250439,
	"learning_rate": 7.999997131859992e-06,
	"loss": 0.0,
	"step": 80,
	"step_time": 12.165714977000334
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.031250000931322575,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04375000111758709,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.1357073881663382,
	"epoch": 0.00162,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.008707523345947266,
	"kl": 0.19407588429749012,
	"learning_rate": 7.999997000008536e-06,
	"loss": 0.0,
	"num_tokens": 4264863.0,
	"reward": 2.4384140968322754,
	"reward_std": 0.4922390580177307,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.963716983795166,
	"rewards/probe_shaping_dominance/std": 0.11659354716539383,
	"rewards/probe_terminal_raw/mean": 0.03658536449074745,
	"rewards/probe_terminal_raw/std": 0.11809173226356506,
	"rewards/rollout_reward_func/mean": -0.44938817620277405,
	"rewards/rollout_reward_func/std": 0.28418225049972534,
	"sampling/importance_sampling_ratio/max": 1.7522894144058228,
	"sampling/importance_sampling_ratio/mean": 0.9879751205444336,
	"sampling/importance_sampling_ratio/min": 0.4941127300262451,
	"sampling/sampling_logp_difference/max": 0.5609221458435059,
	"sampling/sampling_logp_difference/mean": 0.03759397938847542,
	"step": 81,
	"step_time": 26.34822328099972
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.14159703021869063,
	"epoch": 0.00164,
	"grad_norm": 0.009574824012815952,
	"kl": 0.1771204932992987,
	"learning_rate": 7.999996865194129e-06,
	"loss": 0.0,
	"step": 82,
	"step_time": 11.777719495999463
	},
	{
	"clip_ratio/high_max": 0.06250000186264515,
	"clip_ratio/high_mean": 0.031250000931322575,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04375000111758709,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.11876969272270799,
	"epoch": 0.00166,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010034332983195782,
	"kl": 0.36267855847108876,
	"learning_rate": 7.99999672741677e-06,
	"loss": 0.0001,
	"num_tokens": 4371298.0,
	"reward": 2.316115379333496,
	"reward_std": 0.4054742753505707,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9912324547767639,
	"rewards/probe_shaping_dominance/std": 0.049596767872571945,
	"rewards/probe_terminal_raw/mean": 0.009273373521864414,
	"rewards/probe_terminal_raw/std": 0.05245811864733696,
	"rewards/rollout_reward_func/mean": -0.5093902349472046,
	"rewards/rollout_reward_func/std": 0.24608401954174042,
	"sampling/importance_sampling_ratio/max": 1.394594430923462,
	"sampling/importance_sampling_ratio/mean": 0.9233759045600891,
	"sampling/importance_sampling_ratio/min": 0.08404743671417236,
	"sampling/sampling_logp_difference/max": 2.4710586071014404,
	"sampling/sampling_logp_difference/mean": 0.07214178144931793,
	"step": 83,
	"step_time": 27.42874688900065
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.11967162042856216,
	"epoch": 0.00168,
	"grad_norm": 0.009677170775830746,
	"kl": 0.30461428755370434,
	"learning_rate": 7.999996586676458e-06,
	"loss": 0.0001,
	"step": 84,
	"step_time": 12.210796541999116
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08633493585512042,
	"epoch": 0.0017,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.009309964254498482,
	"kl": 0.34726120328798515,
	"learning_rate": 7.999996442973193e-06,
	"loss": -0.0,
	"num_tokens": 4476938.0,
	"reward": 2.3256678581237793,
	"reward_std": 0.3970645070075989,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5930821895599365,
	"rewards/rollout_reward_func/std": 0.20994225144386292,
	"sampling/importance_sampling_ratio/max": 2.7198355197906494,
	"sampling/importance_sampling_ratio/mean": 0.965837836265564,
	"sampling/importance_sampling_ratio/min": 0.0,
	"sampling/sampling_logp_difference/max": 2.636561870574951,
	"sampling/sampling_logp_difference/mean": 0.07213791459798813,
	"step": 85,
	"step_time": 26.77135907899992
	},
	{
	"clip_ratio/high_max": 0.06250000186264515,
	"clip_ratio/high_mean": 0.031250000931322575,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0416666679084301,
	"entropy": 0.08549185702577233,
	"epoch": 0.00172,
	"grad_norm": 0.00986558198928833,
	"kl": 0.6476581503327452,
	"learning_rate": 7.99999629630698e-06,
	"loss": -0.0,
	"step": 86,
	"step_time": 11.659285754999019
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.78125,
	"completions/mean_terminated_length": 2.78125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08913910732371733,
	"epoch": 0.00174,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005745335482060909,
	"kl": 0.21945283197192111,
	"learning_rate": 7.999996146677813e-06,
	"loss": -0.0001,
	"num_tokens": 4579856.0,
	"reward": 2.2342212200164795,
	"reward_std": 0.5761978030204773,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.78125,
	"rewards/probe_completion_length/std": 0.420013427734375,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.985537052154541,
	"rewards/probe_shaping_dominance/std": 0.08181492984294891,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.4981907904148102,
	"rewards/rollout_reward_func/std": 0.2684464752674103,
	"sampling/importance_sampling_ratio/max": 1.1302220821380615,
	"sampling/importance_sampling_ratio/mean": 0.9439641833305359,
	"sampling/importance_sampling_ratio/min": 0.0,
	"sampling/sampling_logp_difference/max": 1.921440839767456,
	"sampling/sampling_logp_difference/mean": 0.047181740403175354,
	"step": 87,
	"step_time": 27.09005630599995
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.08115326758706942,
	"epoch": 0.00176,
	"grad_norm": 0.003665071912109852,
	"kl": 0.22057799324602456,
	"learning_rate": 7.999995994085696e-06,
	"loss": -0.0001,
	"step": 88,
	"step_time": 12.136771756998769
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07007716363295913,
	"epoch": 0.00178,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.007810859940946102,
	"kl": 0.6949258089686055,
	"learning_rate": 7.999995838530628e-06,
	"loss": -0.0,
	"num_tokens": 4685612.0,
	"reward": 2.3873391151428223,
	"reward_std": 0.4150564968585968,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5001606941223145,
	"rewards/rollout_reward_func/std": 0.2632400095462799,
	"sampling/importance_sampling_ratio/max": 1.329830527305603,
	"sampling/importance_sampling_ratio/mean": 0.9396188259124756,
	"sampling/importance_sampling_ratio/min": 0.09286217391490936,
	"sampling/sampling_logp_difference/max": 2.376638174057007,
	"sampling/sampling_logp_difference/mean": 0.05502761900424957,
	"step": 89,
	"step_time": 26.554008219000025
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.07465687586227432,
	"epoch": 0.0018,
	"grad_norm": 0.009502755478024483,
	"kl": 0.22063382680062205,
	"learning_rate": 7.99999568001261e-06,
	"loss": -0.0,
	"step": 90,
	"step_time": 12.219043876999876
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.14270146866329014,
	"epoch": 0.00182,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.008744009770452976,
	"kl": 0.11013963767254609,
	"learning_rate": 7.999995518531638e-06,
	"loss": -0.0001,
	"num_tokens": 4789951.0,
	"reward": 2.567716360092163,
	"reward_std": 0.9114633798599243,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 2.125,
	"rewards/probe_completion_length/std": 0.9069623351097107,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9740893840789795,
	"rewards/probe_shaping_dominance/std": 0.10259100794792175,
	"rewards/probe_terminal_raw/mean": 0.02515243925154209,
	"rewards/probe_terminal_raw/std": 0.10202876478433609,
	"rewards/rollout_reward_func/mean": -0.5065252184867859,
	"rewards/rollout_reward_func/std": 0.20758704841136932,
	"sampling/importance_sampling_ratio/max": 1.6487281322479248,
	"sampling/importance_sampling_ratio/mean": 0.9680857062339783,
	"sampling/importance_sampling_ratio/min": 0.3606947958469391,
	"sampling/sampling_logp_difference/max": 0.7544957399368286,
	"sampling/sampling_logp_difference/mean": 0.04080694913864136,
	"step": 91,
	"step_time": 26.54145688799963
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.04375000111758709,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.054166668094694614,
	"entropy": 0.1536610189359635,
	"epoch": 0.00184,
	"grad_norm": 0.0049968562088906765,
	"kl": 0.21468755277851415,
	"learning_rate": 7.999995354087718e-06,
	"loss": -0.0001,
	"step": 92,
	"step_time": 12.239923568000904
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09390545927453786,
	"epoch": 0.00186,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.00847246777266264,
	"kl": 0.4723499550793804,
	"learning_rate": 7.999995186680847e-06,
	"loss": -0.0,
	"num_tokens": 4891817.0,
	"reward": 2.240363121032715,
	"reward_std": 0.4286558926105499,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9915216565132141,
	"rewards/probe_shaping_dominance/std": 0.04796085134148598,
	"rewards/probe_terminal_raw/mean": 0.008511179126799107,
	"rewards/probe_terminal_raw/std": 0.04814650118350983,
	"rewards/rollout_reward_func/mean": -0.5221695899963379,
	"rewards/rollout_reward_func/std": 0.18585550785064697,
	"sampling/importance_sampling_ratio/max": 1.2803471088409424,
	"sampling/importance_sampling_ratio/mean": 0.9798120856285095,
	"sampling/importance_sampling_ratio/min": 0.28233107924461365,
	"sampling/sampling_logp_difference/max": 1.2646756172180176,
	"sampling/sampling_logp_difference/mean": 0.03255663067102432,
	"step": 93,
	"step_time": 26.499364807999882
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.09494142327457666,
	"epoch": 0.00188,
	"grad_norm": 0.005891559179872274,
	"kl": 0.4762792717665434,
	"learning_rate": 7.999995016311026e-06,
	"loss": -0.0,
	"step": 94,
	"step_time": 11.590511038999466
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.78125,
	"completions/mean_terminated_length": 2.78125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0855805806349963,
	"epoch": 0.0019,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010784839279949665,
	"kl": 0.5285673206672072,
	"learning_rate": 7.999994842978255e-06,
	"loss": 0.0,
	"num_tokens": 4999030.0,
	"reward": 2.307888984680176,
	"reward_std": 0.558517575263977,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.78125,
	"rewards/probe_completion_length/std": 0.420013427734375,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.4233608841896057,
	"rewards/rollout_reward_func/std": 0.2430049329996109,
	"sampling/importance_sampling_ratio/max": 2.3040266036987305,
	"sampling/importance_sampling_ratio/mean": 1.0930638313293457,
	"sampling/importance_sampling_ratio/min": 0.26607653498649597,
	"sampling/sampling_logp_difference/max": 1.3239718675613403,
	"sampling/sampling_logp_difference/mean": 0.0572347566485405,
	"step": 95,
	"step_time": 27.32456371700073
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.08265005028806627,
	"epoch": 0.00192,
	"grad_norm": 0.009639889933168888,
	"kl": 0.5285577713511884,
	"learning_rate": 7.999994666682534e-06,
	"loss": 0.0,
	"step": 96,
	"step_time": 12.08934896799974
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10442803846672177,
	"epoch": 0.00194,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.007832064293324947,
	"kl": 1.2743625693256035,
	"learning_rate": 7.999994487423863e-06,
	"loss": 0.0002,
	"num_tokens": 5101617.0,
	"reward": 2.3278391361236572,
	"reward_std": 0.21062178909778595,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5909109115600586,
	"rewards/rollout_reward_func/std": 0.17344380915164948,
	"sampling/importance_sampling_ratio/max": 1.2738028764724731,
	"sampling/importance_sampling_ratio/mean": 0.8911948204040527,
	"sampling/importance_sampling_ratio/min": 0.0,
	"sampling/sampling_logp_difference/max": 2.880244493484497,
	"sampling/sampling_logp_difference/mean": 0.08490461856126785,
	"step": 97,
	"step_time": 26.761640363000424
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.09561855113133788,
	"epoch": 0.00196,
	"grad_norm": 0.0042576780542731285,
	"kl": 0.8573908178368583,
	"learning_rate": 7.999994305202242e-06,
	"loss": 0.0002,
	"step": 98,
	"step_time": 12.239888331999737
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.12256050202995539,
	"epoch": 0.00198,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.03982119634747505,
	"kl": 0.4613347239792347,
	"learning_rate": 7.999994120017672e-06,
	"loss": 0.0,
	"num_tokens": 5208185.0,
	"reward": 2.3622024059295654,
	"reward_std": 0.3201013505458832,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9496574401855469,
	"rewards/probe_shaping_dominance/std": 0.13700911402702332,
	"rewards/probe_terminal_raw/mean": 0.0570375993847847,
	"rewards/probe_terminal_raw/std": 0.15571396052837372,
	"rewards/rollout_reward_func/mean": -0.5007427334785461,
	"rewards/rollout_reward_func/std": 0.2577684223651886,
	"sampling/importance_sampling_ratio/max": 2.246042490005493,
	"sampling/importance_sampling_ratio/mean": 1.0854158401489258,
	"sampling/importance_sampling_ratio/min": 0.0747772604227066,
	"sampling/sampling_logp_difference/max": 2.5932421684265137,
	"sampling/sampling_logp_difference/mean": 0.07237481325864792,
	"step": 99,
	"step_time": 28.563245160000406
	},
	{
	"clip_ratio/high_max": 0.05000000074505806,
	"clip_ratio/high_mean": 0.02500000037252903,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04583333432674408,
	"entropy": 0.11652607470750809,
	"epoch": 0.002,
	"grad_norm": 0.013196082785725594,
	"kl": 1.1047777848725673,
	"learning_rate": 7.999993931870152e-06,
	"loss": -0.0,
	"step": 100,
	"step_time": 11.832685018998745
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.11155627248808742,
	"epoch": 0.00202,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.011043570004403591,
	"kl": 0.8486065305769444,
	"learning_rate": 7.999993740759685e-06,
	"loss": 0.0,
	"num_tokens": 5312092.0,
	"reward": 2.469048261642456,
	"reward_std": 0.296406090259552,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.24593468010425568,
	"rewards/probe_shaping_dominance/mean": 0.9914976358413696,
	"rewards/probe_shaping_dominance/std": 0.04809650778770447,
	"rewards/probe_terminal_raw/mean": 0.00889227632433176,
	"rewards/probe_terminal_raw/std": 0.05030231550335884,
	"rewards/rollout_reward_func/mean": -0.5125917196273804,
	"rewards/rollout_reward_func/std": 0.1837811917066574,
	"sampling/importance_sampling_ratio/max": 1.2519433498382568,
	"sampling/importance_sampling_ratio/mean": 0.8515626192092896,
	"sampling/importance_sampling_ratio/min": 0.08545338362455368,
	"sampling/sampling_logp_difference/max": 2.4583053588867188,
	"sampling/sampling_logp_difference/mean": 0.1055741012096405,
	"step": 101,
	"step_time": 28.246981163999408
	},
	{
	"clip_ratio/high_max": 0.0833333358168602,
	"clip_ratio/high_mean": 0.0416666679084301,
	"clip_ratio/low_mean": 0.031250000931322575,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.07291666883975267,
	"entropy": 0.10925065912306309,
	"epoch": 0.00204,
	"grad_norm": 0.008332287892699242,
	"kl": 0.7459432929754257,
	"learning_rate": 7.999993546686268e-06,
	"loss": 0.0,
	"step": 102,
	"step_time": 12.24685298599934
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09578724391758442,
	"epoch": 0.00206,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005429553799331188,
	"kl": 0.3181111275916919,
	"learning_rate": 7.999993349649902e-06,
	"loss": 0.0001,
	"num_tokens": 5417356.0,
	"reward": 2.296133279800415,
	"reward_std": 0.48034343123435974,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.983701229095459,
	"rewards/probe_shaping_dominance/std": 0.0652911514043808,
	"rewards/probe_terminal_raw/mean": 0.021214431151747704,
	"rewards/probe_terminal_raw/std": 0.08383625000715256,
	"rewards/rollout_reward_func/mean": -0.5025323629379272,
	"rewards/rollout_reward_func/std": 0.23934274911880493,
	"sampling/importance_sampling_ratio/max": 1.7521827220916748,
	"sampling/importance_sampling_ratio/mean": 1.0161978006362915,
	"sampling/importance_sampling_ratio/min": 0.559285044670105,
	"sampling/sampling_logp_difference/max": 0.5810226202011108,
	"sampling/sampling_logp_difference/mean": 0.03578226640820503,
	"step": 103,
	"step_time": 28.179791414999727
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.09543535858392715,
	"epoch": 0.00208,
	"grad_norm": 0.005383977200835943,
	"kl": 0.31692405231297016,
	"learning_rate": 7.999993149650587e-06,
	"loss": 0.0,
	"step": 104,
	"step_time": 11.594287923999673
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10437362408265471,
	"epoch": 0.0021,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006486265454441309,
	"kl": 0.4273503478616476,
	"learning_rate": 7.999992946688324e-06,
	"loss": -0.0,
	"num_tokens": 5522766.0,
	"reward": 2.39151668548584,
	"reward_std": 0.39364051818847656,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.24593468010425568,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5272334814071655,
	"rewards/rollout_reward_func/std": 0.2972264289855957,
	"sampling/importance_sampling_ratio/max": 1.9010006189346313,
	"sampling/importance_sampling_ratio/mean": 1.0246827602386475,
	"sampling/importance_sampling_ratio/min": 0.3678455054759979,
	"sampling/sampling_logp_difference/max": 1.0000989437103271,
	"sampling/sampling_logp_difference/mean": 0.03773331269621849,
	"step": 105,
	"step_time": 26.660096251999676
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.09778932714834809,
	"epoch": 0.00212,
	"grad_norm": 0.005733635742217302,
	"kl": 0.36536745447665453,
	"learning_rate": 7.999992740763114e-06,
	"loss": -0.0,
	"step": 106,
	"step_time": 12.020263065000563
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09189990477170795,
	"epoch": 0.00214,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006950075738132,
	"kl": 0.37158518051728606,
	"learning_rate": 7.999992531874955e-06,
	"loss": 0.0,
	"num_tokens": 5624278.0,
	"reward": 2.3239517211914062,
	"reward_std": 0.4278637170791626,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9810667037963867,
	"rewards/probe_shaping_dominance/std": 0.0745052620768547,
	"rewards/probe_terminal_raw/mean": 0.021214431151747704,
	"rewards/probe_terminal_raw/std": 0.08405215293169022,
	"rewards/rollout_reward_func/mean": -0.472079336643219,
	"rewards/rollout_reward_func/std": 0.24182648956775665,
	"sampling/importance_sampling_ratio/max": 1.8587580919265747,
	"sampling/importance_sampling_ratio/mean": 0.9948133230209351,
	"sampling/importance_sampling_ratio/min": 0.488203763961792,
	"sampling/sampling_logp_difference/max": 0.6990102529525757,
	"sampling/sampling_logp_difference/mean": 0.03366800397634506,
	"step": 107,
	"step_time": 27.280253950999395
	},
	{
	"clip_ratio/high_max": 0.06666666828095913,
	"clip_ratio/high_mean": 0.033333334140479565,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04375000111758709,
	"entropy": 0.07755104900570586,
	"epoch": 0.00216,
	"grad_norm": 0.0029529579915106297,
	"kl": 0.3871547483528275,
	"learning_rate": 7.99999232002385e-06,
	"loss": 0.0,
	"step": 108,
	"step_time": 11.582099404000473
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04583333432674408,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08607161836698651,
	"epoch": 0.00218,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004912302363663912,
	"kl": 0.3110020191234071,
	"learning_rate": 7.999992105209796e-06,
	"loss": 0.0,
	"num_tokens": 5730240.0,
	"reward": 2.3713436126708984,
	"reward_std": 0.34508299827575684,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9868378639221191,
	"rewards/probe_shaping_dominance/std": 0.07445620000362396,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.5186194181442261,
	"rewards/rollout_reward_func/std": 0.22763106226921082,
	"sampling/importance_sampling_ratio/max": 2.4666221141815186,
	"sampling/importance_sampling_ratio/mean": 0.9437046051025391,
	"sampling/importance_sampling_ratio/min": 0.16313567757606506,
	"sampling/sampling_logp_difference/max": 1.8131763935089111,
	"sampling/sampling_logp_difference/mean": 0.07055296003818512,
	"step": 109,
	"step_time": 27.85804966900014
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.08376848418265581,
	"epoch": 0.0022,
	"grad_norm": 0.021030370146036148,
	"kl": 0.3346872879192233,
	"learning_rate": 7.999991887432795e-06,
	"loss": 0.0,
	"step": 110,
	"step_time": 12.221424097000181
	},
	{
	"clip_ratio/high_max": 0.03125,
	"clip_ratio/high_mean": 0.015625,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.03645833395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09564799422514625,
	"epoch": 0.00222,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010623163543641567,
	"kl": 1.25646445970051,
	"learning_rate": 7.999991666692848e-06,
	"loss": 0.0001,
	"num_tokens": 5834866.0,
	"reward": 2.371830463409424,
	"reward_std": 0.455732524394989,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9698338508605957,
	"rewards/probe_shaping_dominance/std": 0.118809275329113,
	"rewards/probe_terminal_raw/mean": 0.02909044735133648,
	"rewards/probe_terminal_raw/std": 0.11480555683374405,
	"rewards/rollout_reward_func/mean": -0.45209401845932007,
	"rewards/rollout_reward_func/std": 0.2390637993812561,
	"sampling/importance_sampling_ratio/max": 2.435302972793579,
	"sampling/importance_sampling_ratio/mean": 0.9616929292678833,
	"sampling/importance_sampling_ratio/min": 0.18086190521717072,
	"sampling/sampling_logp_difference/max": 1.7100262641906738,
	"sampling/sampling_logp_difference/mean": 0.06157621741294861,
	"step": 111,
	"step_time": 27.536669213000096
	},
	{
	"clip_ratio/high_max": 0.05625000037252903,
	"clip_ratio/high_mean": 0.028125000186264515,
	"clip_ratio/low_mean": 0.031250000931322575,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.05937500111758709,
	"entropy": 0.09540150425164029,
	"epoch": 0.00224,
	"grad_norm": 0.005310059990733862,
	"kl": 0.7572433853056282,
	"learning_rate": 7.999991442989953e-06,
	"loss": 0.0001,
	"step": 112,
	"step_time": 11.58020766800064
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05377835238323314,
	"epoch": 0.00226,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0054777092300355434,
	"kl": 0.2139036045409739,
	"learning_rate": 7.999991216324112e-06,
	"loss": 0.0,
	"num_tokens": 5941971.0,
	"reward": 2.3715004920959473,
	"reward_std": 0.3570369482040405,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9854661822319031,
	"rewards/probe_shaping_dominance/std": 0.08221564441919327,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.5483407378196716,
	"rewards/rollout_reward_func/std": 0.21500766277313232,
	"sampling/importance_sampling_ratio/max": 1.468092441558838,
	"sampling/importance_sampling_ratio/mean": 1.0448389053344727,
	"sampling/importance_sampling_ratio/min": 0.9520513415336609,
	"sampling/sampling_logp_difference/max": 0.38396334648132324,
	"sampling/sampling_logp_difference/mean": 0.014699834398925304,
	"step": 113,
	"step_time": 26.95208743199919
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.05231661406651256,
	"epoch": 0.00228,
	"grad_norm": 0.005958650726824999,
	"kl": 0.20708634098750167,
	"learning_rate": 7.999990986695325e-06,
	"loss": 0.0,
	"step": 114,
	"step_time": 12.898005667000234
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.1277365549467504,
	"epoch": 0.0023,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010303696617484093,
	"kl": 0.5558968242257833,
	"learning_rate": 7.999990754103591e-06,
	"loss": -0.0,
	"num_tokens": 6048989.0,
	"reward": 2.3545703887939453,
	"reward_std": 0.32267555594444275,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5329294204711914,
	"rewards/rollout_reward_func/std": 0.1960861086845398,
	"sampling/importance_sampling_ratio/max": 2.528221368789673,
	"sampling/importance_sampling_ratio/mean": 0.9982080459594727,
	"sampling/importance_sampling_ratio/min": 0.042695675045251846,
	"sampling/sampling_logp_difference/max": 3.153654098510742,
	"sampling/sampling_logp_difference/mean": 0.08483341336250305,
	"step": 115,
	"step_time": 28.715120017999652
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.031250000931322575,
	"clip_ratio/low_min": 0.02083333395421505,
	"clip_ratio/region_mean": 0.0416666679084301,
	"entropy": 0.1117813317105174,
	"epoch": 0.00232,
	"grad_norm": 0.006610220763832331,
	"kl": 0.6069826502352953,
	"learning_rate": 7.99999051854891e-06,
	"loss": -0.0,
	"step": 116,
	"step_time": 12.037885646000177
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.035416667349636555,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.035416667349636555,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08630842622369528,
	"epoch": 0.00234,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.023052336648106575,
	"kl": 4.202049997946233,
	"learning_rate": 7.999990280031285e-06,
	"loss": -0.0,
	"num_tokens": 6156241.0,
	"reward": 2.3509585857391357,
	"reward_std": 0.3719061613082886,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5052914619445801,
	"rewards/rollout_reward_func/std": 0.26930469274520874,
	"sampling/importance_sampling_ratio/max": 1.4201393127441406,
	"sampling/importance_sampling_ratio/mean": 0.9191266298294067,
	"sampling/importance_sampling_ratio/min": 0.04002097621560097,
	"sampling/sampling_logp_difference/max": 3.218353271484375,
	"sampling/sampling_logp_difference/mean": 0.08381534367799759,
	"step": 117,
	"step_time": 27.4478307280001
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.08493484603241086,
	"epoch": 0.00236,
	"grad_norm": 0.005157412961125374,
	"kl": 0.8633453572015242,
	"learning_rate": 7.999990038550715e-06,
	"loss": -0.0001,
	"step": 118,
	"step_time": 12.410220233000018
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.1009751778037753,
	"epoch": 0.00238,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.007704886142164469,
	"kl": 1.133708338191262,
	"learning_rate": 7.9999897941072e-06,
	"loss": -0.0,
	"num_tokens": 6261608.0,
	"reward": 2.272282600402832,
	"reward_std": 0.4321046769618988,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5527174472808838,
	"rewards/rollout_reward_func/std": 0.2261652648448944,
	"sampling/importance_sampling_ratio/max": 1.9247888326644897,
	"sampling/importance_sampling_ratio/mean": 0.9601424932479858,
	"sampling/importance_sampling_ratio/min": 0.10850485414266586,
	"sampling/sampling_logp_difference/max": 2.221635580062866,
	"sampling/sampling_logp_difference/mean": 0.06387770175933838,
	"step": 119,
	"step_time": 27.243004307998945
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.10485226087621413,
	"epoch": 0.0024,
	"grad_norm": 0.005486879497766495,
	"kl": 0.7662449008450487,
	"learning_rate": 7.999989546700739e-06,
	"loss": -0.0,
	"step": 120,
	"step_time": 11.642901553001138
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05734692560508847,
	"epoch": 0.00242,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0022395749110728502,
	"kl": 0.4620458657536801,
	"learning_rate": 7.999989296331334e-06,
	"loss": 0.0,
	"num_tokens": 6364884.0,
	"reward": 2.300528049468994,
	"reward_std": 0.3925109803676605,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9786701202392578,
	"rewards/probe_shaping_dominance/std": 0.08399670571088791,
	"rewards/probe_terminal_raw/mean": 0.020452234894037247,
	"rewards/probe_terminal_raw/std": 0.08055972307920456,
	"rewards/rollout_reward_func/mean": -0.5235942602157593,
	"rewards/rollout_reward_func/std": 0.19283899664878845,
	"sampling/importance_sampling_ratio/max": 1.684720754623413,
	"sampling/importance_sampling_ratio/mean": 0.9979562163352966,
	"sampling/importance_sampling_ratio/min": 0.3297406733036041,
	"sampling/sampling_logp_difference/max": 1.109449863433838,
	"sampling/sampling_logp_difference/mean": 0.03222563862800598,
	"step": 121,
	"step_time": 27.102160742999786
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.06036481284536421,
	"epoch": 0.00244,
	"grad_norm": 0.0021346518769860268,
	"kl": 0.460031573350534,
	"learning_rate": 7.999989042998983e-06,
	"loss": 0.0,
	"step": 122,
	"step_time": 12.627941945999737
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 3.0,
	"completions/mean_terminated_length": 3.0,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 0.08197857672348619,
	"epoch": 0.00246,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005835927091538906,
	"kl": 0.3058228840382071,
	"learning_rate": 7.99998878670369e-06,
	"loss": -0.0,
	"num_tokens": 6470259.0,
	"reward": 2.4272561073303223,
	"reward_std": 0.2215338796377182,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 2.0,
	"rewards/probe_completion_length/std": 0.0,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9598830342292786,
	"rewards/probe_shaping_dominance/std": 0.13210204243659973,
	"rewards/probe_terminal_raw/mean": 0.04026930779218674,
	"rewards/probe_terminal_raw/std": 0.13092826306819916,
	"rewards/rollout_reward_func/mean": -0.5228960514068604,
	"rewards/rollout_reward_func/std": 0.22377446293830872,
	"sampling/importance_sampling_ratio/max": 1.2321637868881226,
	"sampling/importance_sampling_ratio/mean": 0.9182083606719971,
	"sampling/importance_sampling_ratio/min": 0.0,
	"sampling/sampling_logp_difference/max": 1.2927324771881104,
	"sampling/sampling_logp_difference/mean": 0.04780565947294235,
	"step": 123,
	"step_time": 27.481588907000514
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.0761918865609914,
	"epoch": 0.00248,
	"grad_norm": 0.005192534998059273,
	"kl": 0.32337066042236984,
	"learning_rate": 7.999988527445453e-06,
	"loss": -0.0,
	"step": 124,
	"step_time": 11.74153527999988
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.061301857323996956,
	"epoch": 0.0025,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004524994175881147,
	"kl": 0.20127144705232547,
	"learning_rate": 7.99998826522427e-06,
	"loss": -0.0,
	"num_tokens": 6573122.0,
	"reward": 2.5412168502807617,
	"reward_std": 0.4934008717536926,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.3535533845424652,
	"rewards/probe_shaping_dominance/mean": 0.9729976058006287,
	"rewards/probe_shaping_dominance/std": 0.10630916804075241,
	"rewards/probe_terminal_raw/mean": 0.028963414952158928,
	"rewards/probe_terminal_raw/std": 0.11434794962406158,
	"rewards/rollout_reward_func/mean": -0.44199419021606445,
	"rewards/rollout_reward_func/std": 0.23288173973560333,
	"sampling/importance_sampling_ratio/max": 2.8899707794189453,
	"sampling/importance_sampling_ratio/mean": 1.0233311653137207,
	"sampling/importance_sampling_ratio/min": 0.5645219683647156,
	"sampling/sampling_logp_difference/max": 1.0612452030181885,
	"sampling/sampling_logp_difference/mean": 0.02934853918850422,
	"step": 125,
	"step_time": 26.56314809100013
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.0587867568974616,
	"epoch": 0.00252,
	"grad_norm": 0.003286329098045826,
	"kl": 0.23132333873703226,
	"learning_rate": 7.999988000040144e-06,
	"loss": -0.0,
	"step": 126,
	"step_time": 12.704706686999543
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06930449209176004,
	"epoch": 0.00254,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0038534458726644516,
	"kl": 0.8923099512467161,
	"learning_rate": 7.999987731893076e-06,
	"loss": -0.0001,
	"num_tokens": 6674759.0,
	"reward": 2.476976156234741,
	"reward_std": 0.5018807053565979,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.09375,
	"rewards/probe_invalid_count/std": 0.39015090465545654,
	"rewards/probe_shaping_dominance/mean": 0.9897805452346802,
	"rewards/probe_shaping_dominance/std": 0.057810164988040924,
	"rewards/probe_terminal_raw/mean": 0.010797764174640179,
	"rewards/probe_terminal_raw/std": 0.06108137592673302,
	"rewards/rollout_reward_func/mean": -0.5048520565032959,
	"rewards/rollout_reward_func/std": 0.23183932900428772,
	"sampling/importance_sampling_ratio/max": 2.6555376052856445,
	"sampling/importance_sampling_ratio/mean": 1.037369728088379,
	"sampling/importance_sampling_ratio/min": 0.18285271525382996,
	"sampling/sampling_logp_difference/max": 1.6990761756896973,
	"sampling/sampling_logp_difference/mean": 0.04799798130989075,
	"step": 127,
	"step_time": 26.519593818999965
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.07739454251714051,
	"epoch": 0.00256,
	"grad_norm": 0.0046963742934167385,
	"kl": 0.8950551702291705,
	"learning_rate": 7.999987460783066e-06,
	"loss": -0.0001,
	"step": 128,
	"step_time": 11.701040565999392
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04215445008594543,
	"epoch": 0.00258,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004038817714899778,
	"kl": 0.483372636698145,
	"learning_rate": 7.999987186710111e-06,
	"loss": -0.0001,
	"num_tokens": 6778164.0,
	"reward": 2.3669238090515137,
	"reward_std": 0.33272045850753784,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9665718078613281,
	"rewards/probe_shaping_dominance/std": 0.11594124883413315,
	"rewards/probe_terminal_raw/mean": 0.033663615584373474,
	"rewards/probe_terminal_raw/std": 0.11093832552433014,
	"rewards/rollout_reward_func/mean": -0.5208115577697754,
	"rewards/rollout_reward_func/std": 0.22583386301994324,
	"sampling/importance_sampling_ratio/max": 1.324372410774231,
	"sampling/importance_sampling_ratio/mean": 0.9827702045440674,
	"sampling/importance_sampling_ratio/min": 0.15934889018535614,
	"sampling/sampling_logp_difference/max": 1.8366597890853882,
	"sampling/sampling_logp_difference/mean": 0.03050372563302517,
	"step": 129,
	"step_time": 29.272002608000093
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.040716532384976745,
	"epoch": 0.0026,
	"grad_norm": 0.004598686005920172,
	"kl": 0.48791675676284285,
	"learning_rate": 7.999986909674215e-06,
	"loss": -0.0001,
	"step": 130,
	"step_time": 11.615075072000309
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07680852155863249,
	"epoch": 0.00262,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004599301610141993,
	"kl": 0.5561261102557182,
	"learning_rate": 7.999986629675377e-06,
	"loss": 0.0001,
	"num_tokens": 6881343.0,
	"reward": 2.428385019302368,
	"reward_std": 0.35835328698158264,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.988267183303833,
	"rewards/probe_shaping_dominance/std": 0.06637061387300491,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.4630073308944702,
	"rewards/rollout_reward_func/std": 0.23799148201942444,
	"sampling/importance_sampling_ratio/max": 2.105088472366333,
	"sampling/importance_sampling_ratio/mean": 1.0250680446624756,
	"sampling/importance_sampling_ratio/min": 0.24339471757411957,
	"sampling/sampling_logp_difference/max": 1.413072109222412,
	"sampling/sampling_logp_difference/mean": 0.05859563127160072,
	"step": 131,
	"step_time": 27.499229768000532
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.07459181371518753,
	"epoch": 0.00264,
	"grad_norm": 0.0046109952963888645,
	"kl": 0.4819548297673464,
	"learning_rate": 7.999986346713597e-06,
	"loss": 0.0001,
	"step": 132,
	"step_time": 11.681140706999486
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06963892979547381,
	"epoch": 0.00266,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004053663462400436,
	"kl": 0.29985905811190605,
	"learning_rate": 7.999986060788874e-06,
	"loss": -0.0001,
	"num_tokens": 6984936.0,
	"reward": 2.398922920227051,
	"reward_std": 0.3926793932914734,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9907037019729614,
	"rewards/probe_shaping_dominance/std": 0.052587706595659256,
	"rewards/probe_terminal_raw/mean": 0.007876016199588776,
	"rewards/probe_terminal_raw/std": 0.0445534773170948,
	"rewards/rollout_reward_func/mean": -0.45590683817863464,
	"rewards/rollout_reward_func/std": 0.20304201543331146,
	"sampling/importance_sampling_ratio/max": 1.1057724952697754,
	"sampling/importance_sampling_ratio/mean": 0.917495846748352,
	"sampling/importance_sampling_ratio/min": 0.2753896415233612,
	"sampling/sampling_logp_difference/max": 1.2891517877578735,
	"sampling/sampling_logp_difference/mean": 0.049349602311849594,
	"step": 133,
	"step_time": 28.668226430000686
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.07001902349293232,
	"epoch": 0.00268,
	"grad_norm": 0.0046079279854893684,
	"kl": 0.30660303554032,
	"learning_rate": 7.999985771901212e-06,
	"loss": -0.0001,
	"step": 134,
	"step_time": 11.78814972499913
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0837576383491978,
	"epoch": 0.0027,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004887988790869713,
	"kl": 0.48908784112427384,
	"learning_rate": 7.999985480050609e-06,
	"loss": 0.0,
	"num_tokens": 7089375.0,
	"reward": 2.383143901824951,
	"reward_std": 0.2860008180141449,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9715699553489685,
	"rewards/probe_shaping_dominance/std": 0.11188202351331711,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.5384261608123779,
	"rewards/rollout_reward_func/std": 0.24632836878299713,
	"sampling/importance_sampling_ratio/max": 2.175699234008789,
	"sampling/importance_sampling_ratio/mean": 0.9764343500137329,
	"sampling/importance_sampling_ratio/min": 0.37150871753692627,
	"sampling/sampling_logp_difference/max": 1.0082650184631348,
	"sampling/sampling_logp_difference/mean": 0.04385855793952942,
	"step": 135,
	"step_time": 27.26713926100001
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.07945482805371284,
	"epoch": 0.00272,
	"grad_norm": 0.005393319763243198,
	"kl": 0.4894396271556616,
	"learning_rate": 7.999985185237063e-06,
	"loss": 0.0,
	"step": 136,
	"step_time": 11.740167015000225
	},
	{
	"clip_ratio/high_max": 0.012500000186264515,
	"clip_ratio/high_mean": 0.0062500000931322575,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.016666667070239782,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 7.0,
	"completions/max_terminated_length": 7.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07358018541708589,
	"epoch": 0.00274,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.04956609383225441,
	"kl": 7.594387605204247,
	"learning_rate": 7.999984887460579e-06,
	"loss": 0.0,
	"num_tokens": 7195651.0,
	"reward": 2.523413896560669,
	"reward_std": 1.283755898475647,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 2.0,
	"rewards/probe_completion_length/std": 1.1639753580093384,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9845632314682007,
	"rewards/probe_shaping_dominance/std": 0.08732341974973679,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.4580242931842804,
	"rewards/rollout_reward_func/std": 0.29842740297317505,
	"sampling/importance_sampling_ratio/max": 1.5995361804962158,
	"sampling/importance_sampling_ratio/mean": 0.9101204872131348,
	"sampling/importance_sampling_ratio/min": 0.2878796458244324,
	"sampling/sampling_logp_difference/max": 1.2452144622802734,
	"sampling/sampling_logp_difference/mean": 0.08170486986637115,
	"step": 137,
	"step_time": 35.5617492829997
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04583333432674408,
	"entropy": 0.0833338184747845,
	"epoch": 0.00276,
	"grad_norm": 0.004238603170961142,
	"kl": 0.8713670628203545,
	"learning_rate": 7.999984586721153e-06,
	"loss": -0.0001,
	"step": 138,
	"step_time": 13.092057540999122
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.077066877449397,
	"epoch": 0.00278,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006206016521900892,
	"kl": 0.2502201258515315,
	"learning_rate": 7.999984283018788e-06,
	"loss": -0.0001,
	"num_tokens": 7298420.0,
	"reward": 2.434345006942749,
	"reward_std": 0.33564823865890503,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.4531550407409668,
	"rewards/rollout_reward_func/std": 0.23981256783008575,
	"sampling/importance_sampling_ratio/max": 1.5575754642486572,
	"sampling/importance_sampling_ratio/mean": 0.9945090413093567,
	"sampling/importance_sampling_ratio/min": 0.39499369263648987,
	"sampling/sampling_logp_difference/max": 0.9288842678070068,
	"sampling/sampling_logp_difference/mean": 0.0369817316532135,
	"step": 139,
	"step_time": 26.636275078999915
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.08048825367586687,
	"epoch": 0.0028,
	"grad_norm": 0.004995269235223532,
	"kl": 0.1949386877240613,
	"learning_rate": 7.999983976353484e-06,
	"loss": -0.0001,
	"step": 140,
	"step_time": 11.886442712999724
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09608687367290258,
	"epoch": 0.00282,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010193965397775173,
	"kl": 1.043814627239044,
	"learning_rate": 7.99998366672524e-06,
	"loss": 0.0001,
	"num_tokens": 7400213.0,
	"reward": 2.357463836669922,
	"reward_std": 0.45996955037117004,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9802613258361816,
	"rewards/probe_shaping_dominance/std": 0.0787685438990593,
	"rewards/probe_terminal_raw/mean": 0.017403453588485718,
	"rewards/probe_terminal_raw/std": 0.06857709586620331,
	"rewards/rollout_reward_func/mean": -0.46520087122917175,
	"rewards/rollout_reward_func/std": 0.23765753209590912,
	"sampling/importance_sampling_ratio/max": 2.0903208255767822,
	"sampling/importance_sampling_ratio/mean": 1.064300775527954,
	"sampling/importance_sampling_ratio/min": 0.2817336320877075,
	"sampling/sampling_logp_difference/max": 1.266794204711914,
	"sampling/sampling_logp_difference/mean": 0.04518420994281769,
	"step": 141,
	"step_time": 27.64493636099951
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.10309219686314464,
	"epoch": 0.00284,
	"grad_norm": 0.01219659112393856,
	"kl": 0.6812123054987751,
	"learning_rate": 7.999983354134058e-06,
	"loss": 0.0,
	"step": 142,
	"step_time": 11.569478897000408
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07752494711894542,
	"epoch": 0.00286,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004447088576853275,
	"kl": 0.28799188635699124,
	"learning_rate": 7.999983038579937e-06,
	"loss": -0.0002,
	"num_tokens": 7502202.0,
	"reward": 2.4029557704925537,
	"reward_std": 0.41433292627334595,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.4220443069934845,
	"rewards/rollout_reward_func/std": 0.2347659468650818,
	"sampling/importance_sampling_ratio/max": 2.925204277038574,
	"sampling/importance_sampling_ratio/mean": 1.0200954675674438,
	"sampling/importance_sampling_ratio/min": 0.2386324405670166,
	"sampling/sampling_logp_difference/max": 1.4322543144226074,
	"sampling/sampling_logp_difference/mean": 0.04332014173269272,
	"step": 143,
	"step_time": 27.17438340000035
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.07426338468212634,
	"epoch": 0.00288,
	"grad_norm": 0.004469662439078093,
	"kl": 0.2410876297701634,
	"learning_rate": 7.999982720062878e-06,
	"loss": -0.0002,
	"step": 144,
	"step_time": 12.213636597999539
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08634203940164298,
	"epoch": 0.0029,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002921090926975012,
	"kl": 0.230285348889538,
	"learning_rate": 7.99998239858288e-06,
	"loss": 0.0,
	"num_tokens": 7607649.0,
	"reward": 2.3042469024658203,
	"reward_std": 0.4113651216030121,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5207530856132507,
	"rewards/rollout_reward_func/std": 0.2033592164516449,
	"sampling/importance_sampling_ratio/max": 1.081487774848938,
	"sampling/importance_sampling_ratio/mean": 0.961658239364624,
	"sampling/importance_sampling_ratio/min": 0.3403857946395874,
	"sampling/sampling_logp_difference/max": 0.7405810356140137,
	"sampling/sampling_logp_difference/mean": 0.02413717657327652,
	"step": 145,
	"step_time": 28.17627675400081
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.08849173790076748,
	"epoch": 0.00292,
	"grad_norm": 0.0025327985640615225,
	"kl": 0.24220079024462393,
	"learning_rate": 7.999982074139944e-06,
	"loss": 0.0,
	"step": 146,
	"step_time": 11.552079900000535
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.11541430978104472,
	"epoch": 0.00294,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0037195596378296614,
	"kl": 0.24169684358639643,
	"learning_rate": 7.999981746734073e-06,
	"loss": -0.0001,
	"num_tokens": 7714926.0,
	"reward": 2.362529754638672,
	"reward_std": 0.3588845729827881,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9859415292739868,
	"rewards/probe_shaping_dominance/std": 0.07952678948640823,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.5265366435050964,
	"rewards/rollout_reward_func/std": 0.2366112768650055,
	"sampling/importance_sampling_ratio/max": 1.8165228366851807,
	"sampling/importance_sampling_ratio/mean": 1.0579065084457397,
	"sampling/importance_sampling_ratio/min": 0.4353120028972626,
	"sampling/sampling_logp_difference/max": 0.826627790927887,
	"sampling/sampling_logp_difference/mean": 0.04029189795255661,
	"step": 147,
	"step_time": 27.175546237000162
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.1161738510709256,
	"epoch": 0.00296,
	"grad_norm": 0.0037887210492044687,
	"kl": 0.23712664423510432,
	"learning_rate": 7.999981416365263e-06,
	"loss": -0.0,
	"step": 148,
	"step_time": 12.20823843899916
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.105490946007194,
	"epoch": 0.00298,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005545547232031822,
	"kl": 0.10429394743793807,
	"learning_rate": 7.999981083033518e-06,
	"loss": -0.0,
	"num_tokens": 7820271.0,
	"reward": 2.2831099033355713,
	"reward_std": 0.39255067706108093,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5418901443481445,
	"rewards/rollout_reward_func/std": 0.2250201553106308,
	"sampling/importance_sampling_ratio/max": 1.449048399925232,
	"sampling/importance_sampling_ratio/mean": 0.9792050719261169,
	"sampling/importance_sampling_ratio/min": 0.2817993760108948,
	"sampling/sampling_logp_difference/max": 1.2665607929229736,
	"sampling/sampling_logp_difference/mean": 0.03002801164984703,
	"step": 149,
	"step_time": 27.53580150099924
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.10439014286384918,
	"epoch": 0.003,
	"grad_norm": 0.00822756253182888,
	"kl": 0.11194274778247859,
	"learning_rate": 7.999980746738835e-06,
	"loss": -0.0,
	"step": 150,
	"step_time": 11.669001740000112
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.1254521356895566,
	"epoch": 0.00302,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.008205846883356571,
	"kl": 0.2568075335584581,
	"learning_rate": 7.999980407481217e-06,
	"loss": -0.0,
	"num_tokens": 7922328.0,
	"reward": 2.4083704948425293,
	"reward_std": 0.3905543088912964,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9785879850387573,
	"rewards/probe_shaping_dominance/std": 0.08963118493556976,
	"rewards/probe_terminal_raw/mean": 0.0260416679084301,
	"rewards/probe_terminal_raw/std": 0.1046360433101654,
	"rewards/rollout_reward_func/mean": -0.45250916481018066,
	"rewards/rollout_reward_func/std": 0.25463223457336426,
	"sampling/importance_sampling_ratio/max": 1.165947437286377,
	"sampling/importance_sampling_ratio/mean": 0.9090801477432251,
	"sampling/importance_sampling_ratio/min": 0.0,
	"sampling/sampling_logp_difference/max": 1.9794785976409912,
	"sampling/sampling_logp_difference/mean": 0.06048261374235153,
	"step": 151,
	"step_time": 25.965173581000272
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.11868520639836788,
	"epoch": 0.00304,
	"grad_norm": 0.008953132666647434,
	"kl": 0.6233456870540977,
	"learning_rate": 7.999980065260663e-06,
	"loss": -0.0001,
	"step": 152,
	"step_time": 12.843935258000784
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.11048904561903328,
	"epoch": 0.00306,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.00968129187822342,
	"kl": 0.14061896470107627,
	"learning_rate": 7.999979720077173e-06,
	"loss": -0.0,
	"num_tokens": 8026423.0,
	"reward": 2.419642925262451,
	"reward_std": 0.30986252427101135,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9758727550506592,
	"rewards/probe_shaping_dominance/std": 0.10938115417957306,
	"rewards/probe_terminal_raw/mean": 0.0209603663533926,
	"rewards/probe_terminal_raw/std": 0.09247327595949173,
	"rewards/rollout_reward_func/mean": -0.49594029784202576,
	"rewards/rollout_reward_func/std": 0.2378591150045395,
	"sampling/importance_sampling_ratio/max": 1.1600902080535889,
	"sampling/importance_sampling_ratio/mean": 0.9520583152770996,
	"sampling/importance_sampling_ratio/min": 0.5003088712692261,
	"sampling/sampling_logp_difference/max": 0.6657150983810425,
	"sampling/sampling_logp_difference/mean": 0.025925474241375923,
	"step": 153,
	"step_time": 26.941947170000276
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.1122089575510472,
	"epoch": 0.00308,
	"grad_norm": 0.00867843721061945,
	"kl": 0.15484224071647645,
	"learning_rate": 7.99997937193075e-06,
	"loss": -0.0,
	"step": 154,
	"step_time": 11.658896313999776
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0629729179199785,
	"epoch": 0.0031,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.003953923936933279,
	"kl": 0.03362982640601331,
	"learning_rate": 7.99997902082139e-06,
	"loss": 0.0,
	"num_tokens": 8134364.0,
	"reward": 2.304103374481201,
	"reward_std": 0.3902580142021179,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9852296113967896,
	"rewards/probe_shaping_dominance/std": 0.08355414122343063,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.5217512845993042,
	"rewards/rollout_reward_func/std": 0.20511233806610107,
	"sampling/importance_sampling_ratio/max": 1.2205973863601685,
	"sampling/importance_sampling_ratio/mean": 0.9658781290054321,
	"sampling/importance_sampling_ratio/min": 0.46778079867362976,
	"sampling/sampling_logp_difference/max": 0.7597565650939941,
	"sampling/sampling_logp_difference/mean": 0.021998237818479538,
	"step": 155,
	"step_time": 27.223922481999125
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.06562891032081097,
	"epoch": 0.00312,
	"grad_norm": 0.004405137151479721,
	"kl": 0.038039611198541934,
	"learning_rate": 7.999978666749097e-06,
	"loss": 0.0,
	"step": 156,
	"step_time": 12.512135376999595
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05827112344559282,
	"epoch": 0.00314,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004053921438753605,
	"kl": 0.22048271807530284,
	"learning_rate": 7.99997830971387e-06,
	"loss": -0.0,
	"num_tokens": 8238748.0,
	"reward": 2.4397072792053223,
	"reward_std": 0.3176124691963196,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9892078638076782,
	"rewards/probe_shaping_dominance/std": 0.061049580574035645,
	"rewards/probe_terminal_raw/mean": 0.010670731775462627,
	"rewards/probe_terminal_raw/std": 0.06036277487874031,
	"rewards/rollout_reward_func/mean": -0.5101712346076965,
	"rewards/rollout_reward_func/std": 0.20784814655780792,
	"sampling/importance_sampling_ratio/max": 1.6952624320983887,
	"sampling/importance_sampling_ratio/mean": 0.9711546301841736,
	"sampling/importance_sampling_ratio/min": 0.0,
	"sampling/sampling_logp_difference/max": 1.1546943187713623,
	"sampling/sampling_logp_difference/mean": 0.03182876855134964,
	"step": 157,
	"step_time": 27.540745071999936
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.031250000931322575,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.058620097348466516,
	"epoch": 0.00316,
	"grad_norm": 0.0032319524325430393,
	"kl": 0.2064171105599364,
	"learning_rate": 7.999977949715709e-06,
	"loss": -0.0,
	"step": 158,
	"step_time": 11.632630814000095
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08823958231369033,
	"epoch": 0.00318,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005462405737489462,
	"kl": 0.09290702206544665,
	"learning_rate": 7.999977586754615e-06,
	"loss": 0.0001,
	"num_tokens": 8341164.0,
	"reward": 2.443883180618286,
	"reward_std": 0.2663474678993225,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9600480794906616,
	"rewards/probe_shaping_dominance/std": 0.12650074064731598,
	"rewards/probe_terminal_raw/mean": 0.046875,
	"rewards/probe_terminal_raw/std": 0.1480722874403,
	"rewards/rollout_reward_func/mean": -0.48178985714912415,
	"rewards/rollout_reward_func/std": 0.22425328195095062,
	"sampling/importance_sampling_ratio/max": 1.382658839225769,
	"sampling/importance_sampling_ratio/mean": 1.018369197845459,
	"sampling/importance_sampling_ratio/min": 0.8050516247749329,
	"sampling/sampling_logp_difference/max": 0.3240091800689697,
	"sampling/sampling_logp_difference/mean": 0.023685907945036888,
	"step": 159,
	"step_time": 27.411928095999883
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.08342378272209316,
	"epoch": 0.0032,
	"grad_norm": 0.0198823194950819,
	"kl": 0.08883899757620384,
	"learning_rate": 7.999977220830588e-06,
	"loss": 0.0001,
	"step": 160,
	"step_time": 12.353684361999967
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06228045103489421,
	"epoch": 0.00322,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002511651022359729,
	"kl": 0.1462944263475947,
	"learning_rate": 7.999976851943628e-06,
	"loss": -0.0,
	"num_tokens": 8445224.0,
	"reward": 2.391735076904297,
	"reward_std": 0.3887004256248474,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.4645148813724518,
	"rewards/rollout_reward_func/std": 0.24512337148189545,
	"sampling/importance_sampling_ratio/max": 1.2499885559082031,
	"sampling/importance_sampling_ratio/mean": 0.964512288570404,
	"sampling/importance_sampling_ratio/min": 0.2849932909011841,
	"sampling/sampling_logp_difference/max": 1.2552961111068726,
	"sampling/sampling_logp_difference/mean": 0.02673853561282158,
	"step": 161,
	"step_time": 26.90330324300021
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.07126606599922525,
	"epoch": 0.00324,
	"grad_norm": 0.00517527898773551,
	"kl": 0.13863739833080524,
	"learning_rate": 7.999976480093737e-06,
	"loss": -0.0,
	"step": 162,
	"step_time": 11.688447676000578
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07072257142863236,
	"epoch": 0.00326,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004865987226366997,
	"kl": 0.1391429503753443,
	"learning_rate": 7.999976105280914e-06,
	"loss": -0.0,
	"num_tokens": 8551746.0,
	"reward": 2.3334262371063232,
	"reward_std": 0.42871803045272827,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9307848215103149,
	"rewards/probe_shaping_dominance/std": 0.1679239422082901,
	"rewards/probe_terminal_raw/mean": 0.07113821059465408,
	"rewards/probe_terminal_raw/std": 0.1717527210712433,
	"rewards/rollout_reward_func/mean": -0.5247467756271362,
	"rewards/rollout_reward_func/std": 0.24572212994098663,
	"sampling/importance_sampling_ratio/max": 1.3134804964065552,
	"sampling/importance_sampling_ratio/mean": 1.0010151863098145,
	"sampling/importance_sampling_ratio/min": 0.42815467715263367,
	"sampling/sampling_logp_difference/max": 0.8482714891433716,
	"sampling/sampling_logp_difference/mean": 0.01988227292895317,
	"step": 163,
	"step_time": 28.07267034399956
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.07096506169182248,
	"epoch": 0.00328,
	"grad_norm": 0.004104274325072765,
	"kl": 0.13155441358685493,
	"learning_rate": 7.99997572750516e-06,
	"loss": -0.0,
	"step": 164,
	"step_time": 11.647160391999023
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.078909770467817,
	"epoch": 0.0033,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004251962527632713,
	"kl": 0.09027766038946083,
	"learning_rate": 7.999975346766472e-06,
	"loss": -0.0,
	"num_tokens": 8658732.0,
	"reward": 2.414771795272827,
	"reward_std": 0.3757838010787964,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9784373044967651,
	"rewards/probe_shaping_dominance/std": 0.08622659742832184,
	"rewards/probe_terminal_raw/mean": 0.024517275393009186,
	"rewards/probe_terminal_raw/std": 0.10027948766946793,
	"rewards/rollout_reward_func/mean": -0.47568273544311523,
	"rewards/rollout_reward_func/std": 0.19167323410511017,
	"sampling/importance_sampling_ratio/max": 1.1542701721191406,
	"sampling/importance_sampling_ratio/mean": 0.9669894576072693,
	"sampling/importance_sampling_ratio/min": 0.6857547163963318,
	"sampling/sampling_logp_difference/max": 0.37537309527397156,
	"sampling/sampling_logp_difference/mean": 0.017938656732439995,
	"step": 165,
	"step_time": 27.2606650000007
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.07565992117088172,
	"epoch": 0.00332,
	"grad_norm": 0.006961170118302107,
	"kl": 0.08890455095081506,
	"learning_rate": 7.999974963064855e-06,
	"loss": -0.0,
	"step": 166,
	"step_time": 11.698157390000233
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07434030482545495,
	"epoch": 0.00334,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004832255654036999,
	"kl": 0.15626501338783783,
	"learning_rate": 7.999974576400308e-06,
	"loss": -0.0,
	"num_tokens": 8765380.0,
	"reward": 2.2938361167907715,
	"reward_std": 0.4383181631565094,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.960378885269165,
	"rewards/probe_shaping_dominance/std": 0.12596461176872253,
	"rewards/probe_terminal_raw/mean": 0.046875,
	"rewards/probe_terminal_raw/std": 0.1480722874403,
	"rewards/rollout_reward_func/mean": -0.5071678757667542,
	"rewards/rollout_reward_func/std": 0.2304636836051941,
	"sampling/importance_sampling_ratio/max": 1.6727243661880493,
	"sampling/importance_sampling_ratio/mean": 1.0108327865600586,
	"sampling/importance_sampling_ratio/min": 0.4802703857421875,
	"sampling/sampling_logp_difference/max": 0.737343966960907,
	"sampling/sampling_logp_difference/mean": 0.023180868476629257,
	"step": 167,
	"step_time": 28.38192438599981
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.0768249062821269,
	"epoch": 0.00336,
	"grad_norm": 0.0052077267318964005,
	"kl": 0.15163502033101395,
	"learning_rate": 7.999974186772832e-06,
	"loss": -0.0,
	"step": 168,
	"step_time": 11.745391591000953
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.11408041534014046,
	"epoch": 0.00338,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005346886347979307,
	"kl": 0.05663721589365878,
	"learning_rate": 7.999973794182426e-06,
	"loss": 0.0,
	"num_tokens": 8871458.0,
	"reward": 2.347496271133423,
	"reward_std": 0.37117481231689453,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.984038233757019,
	"rewards/probe_shaping_dominance/std": 0.09029316157102585,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.477167010307312,
	"rewards/rollout_reward_func/std": 0.2263534814119339,
	"sampling/importance_sampling_ratio/max": 1.2048081159591675,
	"sampling/importance_sampling_ratio/mean": 0.967424750328064,
	"sampling/importance_sampling_ratio/min": 0.7366955280303955,
	"sampling/sampling_logp_difference/max": 0.3062773644924164,
	"sampling/sampling_logp_difference/mean": 0.022138062864542007,
	"step": 169,
	"step_time": 26.940671711000505
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.11010659678140655,
	"epoch": 0.0034,
	"grad_norm": 0.006358719430863857,
	"kl": 0.05905036644250572,
	"learning_rate": 7.99997339862909e-06,
	"loss": 0.0,
	"step": 170,
	"step_time": 12.187279679998937
	},
	{
	"clip_ratio/high_max": 0.06666666828095913,
	"clip_ratio/high_mean": 0.033333334140479565,
	"clip_ratio/low_mean": 0.035416667349636555,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.06875000149011612,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10378921253141016,
	"epoch": 0.00342,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004866220988333225,
	"kl": 0.3513250324758701,
	"learning_rate": 7.999973000112826e-06,
	"loss": -0.0,
	"num_tokens": 8977121.0,
	"reward": 2.3662233352661133,
	"reward_std": 0.36591798067092896,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.490026593208313,
	"rewards/rollout_reward_func/std": 0.1713269054889679,
	"sampling/importance_sampling_ratio/max": 2.4813146591186523,
	"sampling/importance_sampling_ratio/mean": 1.0544798374176025,
	"sampling/importance_sampling_ratio/min": 0.5539883375167847,
	"sampling/sampling_logp_difference/max": 0.9087880849838257,
	"sampling/sampling_logp_difference/mean": 0.04017889127135277,
	"step": 171,
	"step_time": 27.655318435999106
	},
	{
	"clip_ratio/high_max": 0.06666666828095913,
	"clip_ratio/high_mean": 0.033333334140479565,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04375000111758709,
	"entropy": 0.10798206774052233,
	"epoch": 0.00344,
	"grad_norm": 0.012118767946958542,
	"kl": 0.39312139721005224,
	"learning_rate": 7.999972598633632e-06,
	"loss": -0.0,
	"step": 172,
	"step_time": 11.631308623997938
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06574001582339406,
	"epoch": 0.00346,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004147569183260202,
	"kl": 0.01835462471728988,
	"learning_rate": 7.999972194191514e-06,
	"loss": 0.0001,
	"num_tokens": 9080753.0,
	"reward": 2.3741204738616943,
	"reward_std": 0.33386632800102234,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9828835725784302,
	"rewards/probe_shaping_dominance/std": 0.06857176870107651,
	"rewards/probe_terminal_raw/mean": 0.016895325854420662,
	"rewards/probe_terminal_raw/std": 0.067360520362854,
	"rewards/rollout_reward_func/mean": -0.48190829157829285,
	"rewards/rollout_reward_func/std": 0.23477764427661896,
	"sampling/importance_sampling_ratio/max": 2.0903360843658447,
	"sampling/importance_sampling_ratio/mean": 1.0450650453567505,
	"sampling/importance_sampling_ratio/min": 0.8843300342559814,
	"sampling/sampling_logp_difference/max": 0.7373225688934326,
	"sampling/sampling_logp_difference/mean": 0.01723039150238037,
	"step": 173,
	"step_time": 26.502221221999207
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.06844324560370296,
	"epoch": 0.00348,
	"grad_norm": 0.0040916260331869125,
	"kl": 0.022212313354311064,
	"learning_rate": 7.999971786786465e-06,
	"loss": 0.0001,
	"step": 174,
	"step_time": 11.897189610000169
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07524242554791272,
	"epoch": 0.0035,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005887735169380903,
	"kl": 0.22349138231948018,
	"learning_rate": 7.99997137641849e-06,
	"loss": -0.0,
	"num_tokens": 9185715.0,
	"reward": 2.4274468421936035,
	"reward_std": 0.30020296573638916,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9728903770446777,
	"rewards/probe_shaping_dominance/std": 0.10675826668739319,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.46419334411621094,
	"rewards/rollout_reward_func/std": 0.211602121591568,
	"sampling/importance_sampling_ratio/max": 1.1768231391906738,
	"sampling/importance_sampling_ratio/mean": 0.9632406830787659,
	"sampling/importance_sampling_ratio/min": 0.32605040073394775,
	"sampling/sampling_logp_difference/max": 1.1148320436477661,
	"sampling/sampling_logp_difference/mean": 0.02662883885204792,
	"step": 175,
	"step_time": 27.599337874999037
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.0704325451515615,
	"epoch": 0.00352,
	"grad_norm": 0.004202236421406269,
	"kl": 0.2313449110952206,
	"learning_rate": 7.999970963087587e-06,
	"loss": -0.0,
	"step": 176,
	"step_time": 11.622392715999013
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.13769991835579276,
	"epoch": 0.00354,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.008693602867424488,
	"kl": 0.1878440118744038,
	"learning_rate": 7.99997054679376e-06,
	"loss": -0.0001,
	"num_tokens": 9289277.0,
	"reward": 2.358966588973999,
	"reward_std": 0.3925982713699341,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9915453791618347,
	"rewards/probe_shaping_dominance/std": 0.04782645031809807,
	"rewards/probe_terminal_raw/mean": 0.006986788474023342,
	"rewards/probe_terminal_raw/std": 0.03952324390411377,
	"rewards/rollout_reward_func/mean": -0.4958154261112213,
	"rewards/rollout_reward_func/std": 0.18107342720031738,
	"sampling/importance_sampling_ratio/max": 1.5426419973373413,
	"sampling/importance_sampling_ratio/mean": 0.9988285303115845,
	"sampling/importance_sampling_ratio/min": 0.43416687846183777,
	"sampling/sampling_logp_difference/max": 0.5040676593780518,
	"sampling/sampling_logp_difference/mean": 0.04200742021203041,
	"step": 177,
	"step_time": 27.01749301200016
	},
	{
	"clip_ratio/high_max": 0.06666666828095913,
	"clip_ratio/high_mean": 0.033333334140479565,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.04375000111758709,
	"entropy": 0.13739392068237066,
	"epoch": 0.00356,
	"grad_norm": 0.00472621712833643,
	"kl": 0.1952200917294249,
	"learning_rate": 7.999970127537005e-06,
	"loss": -0.0001,
	"step": 178,
	"step_time": 12.335309556999164
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07692393008619547,
	"epoch": 0.00358,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.028508227318525314,
	"kl": 5.385302404543381,
	"learning_rate": 7.999969705317325e-06,
	"loss": 0.0001,
	"num_tokens": 9389166.0,
	"reward": 2.4562783241271973,
	"reward_std": 0.2598528265953064,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.4624716639518738,
	"rewards/rollout_reward_func/std": 0.18222831189632416,
	"sampling/importance_sampling_ratio/max": 1.1737666130065918,
	"sampling/importance_sampling_ratio/mean": 0.9517749547958374,
	"sampling/importance_sampling_ratio/min": 0.2871549129486084,
	"sampling/sampling_logp_difference/max": 1.2477340698242188,
	"sampling/sampling_logp_difference/mean": 0.038661930710077286,
	"step": 179,
	"step_time": 26.827888970999993
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.08128447085618973,
	"epoch": 0.0036,
	"grad_norm": 0.00963876023888588,
	"kl": 2.0179060684172327,
	"learning_rate": 7.99996928013472e-06,
	"loss": 0.0001,
	"step": 180,
	"step_time": 11.37347329900058
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10767973656766117,
	"epoch": 0.00362,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010538225993514061,
	"kl": 1.1300342498579994,
	"learning_rate": 7.999968851989192e-06,
	"loss": 0.0,
	"num_tokens": 9494689.0,
	"reward": 2.297545909881592,
	"reward_std": 0.3879827558994293,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.98197340965271,
	"rewards/probe_shaping_dominance/std": 0.10197389870882034,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.5250524282455444,
	"rewards/rollout_reward_func/std": 0.19950900971889496,
	"sampling/importance_sampling_ratio/max": 1.4360560178756714,
	"sampling/importance_sampling_ratio/mean": 0.9875404834747314,
	"sampling/importance_sampling_ratio/min": 0.18539370596408844,
	"sampling/sampling_logp_difference/max": 1.6852741241455078,
	"sampling/sampling_logp_difference/mean": 0.049665287137031555,
	"step": 181,
	"step_time": 26.69406858900038
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.11389242531731725,
	"epoch": 0.00364,
	"grad_norm": 0.003970544785261154,
	"kl": 0.6293696188367903,
	"learning_rate": 7.999968420880736e-06,
	"loss": 0.0,
	"step": 182,
	"step_time": 12.197639549000996
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09248453052714467,
	"epoch": 0.00366,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0045459093526005745,
	"kl": 0.13789485239249188,
	"learning_rate": 7.99996798680936e-06,
	"loss": -0.0001,
	"num_tokens": 9599380.0,
	"reward": 2.4226768016815186,
	"reward_std": 0.3249405324459076,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9769073724746704,
	"rewards/probe_shaping_dominance/std": 0.09389247745275497,
	"rewards/probe_terminal_raw/mean": 0.024263210594654083,
	"rewards/probe_terminal_raw/std": 0.09960746020078659,
	"rewards/rollout_reward_func/mean": -0.4659937620162964,
	"rewards/rollout_reward_func/std": 0.19758032262325287,
	"sampling/importance_sampling_ratio/max": 1.1653671264648438,
	"sampling/importance_sampling_ratio/mean": 0.9370558261871338,
	"sampling/importance_sampling_ratio/min": 0.46233388781547546,
	"sampling/sampling_logp_difference/max": 0.7714686393737793,
	"sampling/sampling_logp_difference/mean": 0.038370583206415176,
	"step": 183,
	"step_time": 26.904572651000308
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.0933451559394598,
	"epoch": 0.00368,
	"grad_norm": 0.004598891828209162,
	"kl": 0.12106670817593113,
	"learning_rate": 7.999967549775057e-06,
	"loss": -0.0001,
	"step": 184,
	"step_time": 11.607436572001461
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.15387224033474922,
	"epoch": 0.0037,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.010245956480503082,
	"kl": 0.5030446688178927,
	"learning_rate": 7.999967109777834e-06,
	"loss": -0.0,
	"num_tokens": 9707382.0,
	"reward": 2.4315314292907715,
	"reward_std": 0.47317853569984436,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.24593468010425568,
	"rewards/probe_shaping_dominance/mean": 0.9855233430862427,
	"rewards/probe_shaping_dominance/std": 0.0818924754858017,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.4883667528629303,
	"rewards/rollout_reward_func/std": 0.20319455862045288,
	"sampling/importance_sampling_ratio/max": 1.2542879581451416,
	"sampling/importance_sampling_ratio/mean": 0.9586943984031677,
	"sampling/importance_sampling_ratio/min": 0.3715563118457794,
	"sampling/sampling_logp_difference/max": 0.9900554418563843,
	"sampling/sampling_logp_difference/mean": 0.04447564482688904,
	"step": 185,
	"step_time": 27.28605421100019
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.056250001303851604,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.06875000149011612,
	"entropy": 0.15163114294409752,
	"epoch": 0.00372,
	"grad_norm": 0.0044283876195549965,
	"kl": 0.7128359689377248,
	"learning_rate": 7.999966666817687e-06,
	"loss": -0.0,
	"step": 186,
	"step_time": 12.221499876998678
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.1375539805740118,
	"epoch": 0.00374,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006928480230271816,
	"kl": 0.14416655764216557,
	"learning_rate": 7.999966220894617e-06,
	"loss": -0.0,
	"num_tokens": 9814422.0,
	"reward": 2.40926456451416,
	"reward_std": 0.47349250316619873,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9848357439041138,
	"rewards/probe_shaping_dominance/std": 0.08578190207481384,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.3849461078643799,
	"rewards/rollout_reward_func/std": 0.28888392448425293,
	"sampling/importance_sampling_ratio/max": 1.243560791015625,
	"sampling/importance_sampling_ratio/mean": 0.9681116342544556,
	"sampling/importance_sampling_ratio/min": 0.665830671787262,
	"sampling/sampling_logp_difference/max": 0.37914347648620605,
	"sampling/sampling_logp_difference/mean": 0.03084658458828926,
	"step": 187,
	"step_time": 28.931869071998335
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.13996880408376455,
	"epoch": 0.00376,
	"grad_norm": 0.009369016624987125,
	"kl": 0.15229893615469337,
	"learning_rate": 7.999965772008627e-06,
	"loss": -0.0,
	"step": 188,
	"step_time": 11.766830096999001
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.035416667349636555,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10358174092834815,
	"epoch": 0.00378,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.03559152036905289,
	"kl": 0.39252137734001735,
	"learning_rate": 7.999965320159715e-06,
	"loss": 0.0,
	"num_tokens": 9914246.0,
	"reward": 2.483328342437744,
	"reward_std": 0.3890749216079712,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.950668215751648,
	"rewards/probe_shaping_dominance/std": 0.1378525346517563,
	"rewards/probe_terminal_raw/mean": 0.056783534586429596,
	"rewards/probe_terminal_raw/std": 0.15526829659938812,
	"rewards/rollout_reward_func/mean": -0.44287341833114624,
	"rewards/rollout_reward_func/std": 0.26299041509628296,
	"sampling/importance_sampling_ratio/max": 1.2944039106369019,
	"sampling/importance_sampling_ratio/mean": 0.9779493808746338,
	"sampling/importance_sampling_ratio/min": 0.5075531005859375,
	"sampling/sampling_logp_difference/max": 0.6781981587409973,
	"sampling/sampling_logp_difference/mean": 0.026978708803653717,
	"step": 189,
	"step_time": 27.036868832000437
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"entropy": 0.10481282410910353,
	"epoch": 0.0038,
	"grad_norm": 0.0055263713002204895,
	"kl": 0.39047255569312256,
	"learning_rate": 7.999964865347883e-06,
	"loss": 0.0001,
	"step": 190,
	"step_time": 11.940458628999295
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.1198381851427257,
	"epoch": 0.00382,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0030459309928119183,
	"kl": 0.34787876208429225,
	"learning_rate": 7.999964407573131e-06,
	"loss": 0.0,
	"num_tokens": 10017338.0,
	"reward": 2.2820868492126465,
	"reward_std": 0.4749685525894165,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.966022253036499,
	"rewards/probe_shaping_dominance/std": 0.13379566371440887,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.47768545150756836,
	"rewards/rollout_reward_func/std": 0.2837761640548706,
	"sampling/importance_sampling_ratio/max": 1.7857273817062378,
	"sampling/importance_sampling_ratio/mean": 1.0156748294830322,
	"sampling/importance_sampling_ratio/min": 0.514444887638092,
	"sampling/sampling_logp_difference/max": 0.6646687984466553,
	"sampling/sampling_logp_difference/mean": 0.03443087264895439,
	"step": 191,
	"step_time": 27.4936487089999
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.12180041195824742,
	"epoch": 0.00384,
	"grad_norm": 0.009600832127034664,
	"kl": 0.3496675969581702,
	"learning_rate": 7.999963946835458e-06,
	"loss": 0.0,
	"step": 192,
	"step_time": 11.71437842100022
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07277914439328015,
	"epoch": 0.00386,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005412722937762737,
	"kl": 0.6871760921980012,
	"learning_rate": 7.999963483134866e-06,
	"loss": 0.0001,
	"num_tokens": 10123551.0,
	"reward": 2.4312024116516113,
	"reward_std": 0.31741824746131897,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.954460859298706,
	"rewards/probe_shaping_dominance/std": 0.1439775824546814,
	"rewards/probe_terminal_raw/mean": 0.046875,
	"rewards/probe_terminal_raw/std": 0.1480722874403,
	"rewards/rollout_reward_func/mean": -0.48888325691223145,
	"rewards/rollout_reward_func/std": 0.2712078392505646,
	"sampling/importance_sampling_ratio/max": 1.8100159168243408,
	"sampling/importance_sampling_ratio/mean": 1.0015695095062256,
	"sampling/importance_sampling_ratio/min": 0.4417291283607483,
	"sampling/sampling_logp_difference/max": 0.817058801651001,
	"sampling/sampling_logp_difference/mean": 0.03453746810555458,
	"step": 193,
	"step_time": 26.960456193001846
	},
	{
	"clip_ratio/high_max": 0.06250000186264515,
	"clip_ratio/high_mean": 0.031250000931322575,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0416666679084301,
	"entropy": 0.07823239883873612,
	"epoch": 0.00388,
	"grad_norm": 0.01935429498553276,
	"kl": 0.6307496229807157,
	"learning_rate": 7.999963016471355e-06,
	"loss": 0.0001,
	"step": 194,
	"step_time": 12.808481609999944
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02500000037252903,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09379608882591128,
	"epoch": 0.0039,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0030668650288134813,
	"kl": 0.2814688477665186,
	"learning_rate": 7.999962546844924e-06,
	"loss": 0.0001,
	"num_tokens": 10225590.0,
	"reward": 2.361347198486328,
	"reward_std": 0.32310429215431213,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9668688178062439,
	"rewards/probe_shaping_dominance/std": 0.13289061188697815,
	"rewards/probe_terminal_raw/mean": 0.028328251093626022,
	"rewards/probe_terminal_raw/std": 0.11210102587938309,
	"rewards/rollout_reward_func/mean": -0.49010002613067627,
	"rewards/rollout_reward_func/std": 0.24613085389137268,
	"sampling/importance_sampling_ratio/max": 1.3004266023635864,
	"sampling/importance_sampling_ratio/mean": 0.9684375524520874,
	"sampling/importance_sampling_ratio/min": 0.5094537734985352,
	"sampling/sampling_logp_difference/max": 0.6744171380996704,
	"sampling/sampling_logp_difference/mean": 0.028151309117674828,
	"step": 195,
	"step_time": 25.599357043000964
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.09113423456437886,
	"epoch": 0.00392,
	"grad_norm": 0.003771732561290264,
	"kl": 0.27635849734906515,
	"learning_rate": 7.999962074255578e-06,
	"loss": 0.0001,
	"step": 196,
	"step_time": 11.204666337999697
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04920864764972066,
	"epoch": 0.00394,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0027844668366014957,
	"kl": 0.3839081407932099,
	"learning_rate": 7.999961598703312e-06,
	"loss": -0.0,
	"num_tokens": 10330063.0,
	"reward": 2.415410041809082,
	"reward_std": 0.4154632091522217,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9844269156455994,
	"rewards/probe_shaping_dominance/std": 0.08809469640254974,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.44089192152023315,
	"rewards/rollout_reward_func/std": 0.2551630139350891,
	"sampling/importance_sampling_ratio/max": 1.1653680801391602,
	"sampling/importance_sampling_ratio/mean": 0.9744973182678223,
	"sampling/importance_sampling_ratio/min": 0.20111165940761566,
	"sampling/sampling_logp_difference/max": 1.6039009094238281,
	"sampling/sampling_logp_difference/mean": 0.030936850234866142,
	"step": 197,
	"step_time": 26.98998093100181
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.0472417699656944,
	"epoch": 0.00396,
	"grad_norm": 0.0009492259123362601,
	"kl": 0.3996036083844956,
	"learning_rate": 7.99996112018813e-06,
	"loss": -0.0,
	"step": 198,
	"step_time": 12.02342930299983
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06072818394750357,
	"epoch": 0.00398,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0025375511031597853,
	"kl": 0.2914491758947406,
	"learning_rate": 7.999960638710032e-06,
	"loss": 0.0,
	"num_tokens": 10431419.0,
	"reward": 2.499394178390503,
	"reward_std": 0.29632288217544556,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9628201127052307,
	"rewards/probe_shaping_dominance/std": 0.12069481611251831,
	"rewards/probe_terminal_raw/mean": 0.04026930779218674,
	"rewards/probe_terminal_raw/std": 0.13092826306819916,
	"rewards/rollout_reward_func/mean": -0.42244523763656616,
	"rewards/rollout_reward_func/std": 0.24739933013916016,
	"sampling/importance_sampling_ratio/max": 1.3507100343704224,
	"sampling/importance_sampling_ratio/mean": 1.0147151947021484,
	"sampling/importance_sampling_ratio/min": 0.9091832637786865,
	"sampling/sampling_logp_difference/max": 0.338870108127594,
	"sampling/sampling_logp_difference/mean": 0.010294873267412186,
	"step": 199,
	"step_time": 27.086743224999736
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.058446566108614206,
	"epoch": 0.004,
	"grad_norm": 0.0024834321811795235,
	"kl": 0.2936624846115592,
	"learning_rate": 7.999960154269017e-06,
	"loss": 0.0,
	"step": 200,
	"step_time": 11.463394613998389
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09987628925591707,
	"epoch": 0.00402,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0032795185688883066,
	"kl": 0.33637799334246665,
	"learning_rate": 7.999959666865086e-06,
	"loss": -0.0,
	"num_tokens": 10533498.0,
	"reward": 2.4651217460632324,
	"reward_std": 0.32078394293785095,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9926146268844604,
	"rewards/probe_shaping_dominance/std": 0.04177792742848396,
	"rewards/probe_terminal_raw/mean": 0.008003048598766327,
	"rewards/probe_terminal_raw/std": 0.04527207836508751,
	"rewards/rollout_reward_func/mean": -0.4229958653450012,
	"rewards/rollout_reward_func/std": 0.19672146439552307,
	"sampling/importance_sampling_ratio/max": 1.195106863975525,
	"sampling/importance_sampling_ratio/mean": 0.9418940544128418,
	"sampling/importance_sampling_ratio/min": 0.318993479013443,
	"sampling/sampling_logp_difference/max": 0.9258831739425659,
	"sampling/sampling_logp_difference/mean": 0.038004204630851746,
	"step": 201,
	"step_time": 26.624555751001026
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.10252866102382541,
	"epoch": 0.00404,
	"grad_norm": 0.0035051219165325165,
	"kl": 0.3395325805176981,
	"learning_rate": 7.99995917649824e-06,
	"loss": -0.0,
	"step": 202,
	"step_time": 12.736442242999146
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10787439718842506,
	"epoch": 0.00406,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.00344108697026968,
	"kl": 0.40611333276319783,
	"learning_rate": 7.999958683168479e-06,
	"loss": 0.0,
	"num_tokens": 10637062.0,
	"reward": 2.5038881301879883,
	"reward_std": 0.22744759917259216,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.4148617684841156,
	"rewards/rollout_reward_func/std": 0.18833006918430328,
	"sampling/importance_sampling_ratio/max": 1.1548116207122803,
	"sampling/importance_sampling_ratio/mean": 0.9753589630126953,
	"sampling/importance_sampling_ratio/min": 0.7033773064613342,
	"sampling/sampling_logp_difference/max": 0.35186219215393066,
	"sampling/sampling_logp_difference/mean": 0.019522543996572495,
	"step": 203,
	"step_time": 26.715982574999543
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.1049330742098391,
	"epoch": 0.00408,
	"grad_norm": 0.0019796311389654875,
	"kl": 0.4593061124905944,
	"learning_rate": 7.999958186875805e-06,
	"loss": -0.0,
	"step": 204,
	"step_time": 11.646448757999678
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 3.0,
	"completions/mean_terminated_length": 3.0,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 0.055373367242282256,
	"epoch": 0.0041,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006926523055881262,
	"kl": 0.05150494979155518,
	"learning_rate": 7.999957687620215e-06,
	"loss": -0.0,
	"num_tokens": 10738428.0,
	"reward": 2.550138473510742,
	"reward_std": 0.22538912296295166,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 2.0,
	"rewards/probe_completion_length/std": 0.0,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9910282492637634,
	"rewards/probe_shaping_dominance/std": 0.05075191706418991,
	"rewards/probe_terminal_raw/mean": 0.00940040685236454,
	"rewards/probe_terminal_raw/std": 0.05317673459649086,
	"rewards/rollout_reward_func/mean": -0.4002901315689087,
	"rewards/rollout_reward_func/std": 0.22546610236167908,
	"sampling/importance_sampling_ratio/max": 1.2517437934875488,
	"sampling/importance_sampling_ratio/mean": 0.9799097180366516,
	"sampling/importance_sampling_ratio/min": 0.5997620224952698,
	"sampling/sampling_logp_difference/max": 0.5112212896347046,
	"sampling/sampling_logp_difference/mean": 0.01782449334859848,
	"step": 205,
	"step_time": 26.197072295999533
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.06316551179043017,
	"epoch": 0.00412,
	"grad_norm": 0.0017257543513551354,
	"kl": 0.053950335964449536,
	"learning_rate": 7.999957185401714e-06,
	"loss": -0.0,
	"step": 206,
	"step_time": 12.549622151999756
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.11197321023792028,
	"epoch": 0.00414,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.031097499653697014,
	"kl": 3.89400917571038,
	"learning_rate": 7.9999566802203e-06,
	"loss": 0.0001,
	"num_tokens": 10840689.0,
	"reward": 2.345735549926758,
	"reward_std": 0.5137441754341125,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.24593468010425568,
	"rewards/probe_shaping_dominance/mean": 0.9725180268287659,
	"rewards/probe_shaping_dominance/std": 0.10821773111820221,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.5142826437950134,
	"rewards/rollout_reward_func/std": 0.19152681529521942,
	"sampling/importance_sampling_ratio/max": 1.9924818277359009,
	"sampling/importance_sampling_ratio/mean": 0.9943416118621826,
	"sampling/importance_sampling_ratio/min": 0.39203470945358276,
	"sampling/sampling_logp_difference/max": 0.9361467361450195,
	"sampling/sampling_logp_difference/mean": 0.053312450647354126,
	"step": 207,
	"step_time": 26.643122880999726
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.035416667349636555,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.058333334513008595,
	"entropy": 0.11182145914062858,
	"epoch": 0.00416,
	"grad_norm": 0.007240073289722204,
	"kl": 1.6443076208233833,
	"learning_rate": 7.999956172075974e-06,
	"loss": 0.0,
	"step": 208,
	"step_time": 11.64378536300228
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.12927352613769472,
	"epoch": 0.00418,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004229975864291191,
	"kl": 0.6016647743063004,
	"learning_rate": 7.999955660968735e-06,
	"loss": -0.0,
	"num_tokens": 10944113.0,
	"reward": 2.364624261856079,
	"reward_std": 0.36824679374694824,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.49162572622299194,
	"rewards/rollout_reward_func/std": 0.21871674060821533,
	"sampling/importance_sampling_ratio/max": 1.3223011493682861,
	"sampling/importance_sampling_ratio/mean": 0.9632259607315063,
	"sampling/importance_sampling_ratio/min": 0.3602616786956787,
	"sampling/sampling_logp_difference/max": 0.6850378513336182,
	"sampling/sampling_logp_difference/mean": 0.04301746189594269,
	"step": 209,
	"step_time": 26.264724693000062
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.12698473082855344,
	"epoch": 0.0042,
	"grad_norm": 0.004611098673194647,
	"kl": 0.6409582832593514,
	"learning_rate": 7.999955146898586e-06,
	"loss": -0.0001,
	"step": 210,
	"step_time": 12.728916892999223
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04340869339648634,
	"epoch": 0.00422,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0008402821840718389,
	"kl": 0.035792879805057964,
	"learning_rate": 7.999954629865525e-06,
	"loss": -0.0,
	"num_tokens": 11047946.0,
	"reward": 2.3281283378601074,
	"reward_std": 0.43589621782302856,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9857661724090576,
	"rewards/probe_shaping_dominance/std": 0.08051877468824387,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.4982629418373108,
	"rewards/rollout_reward_func/std": 0.20851053297519684,
	"sampling/importance_sampling_ratio/max": 1.0012203454971313,
	"sampling/importance_sampling_ratio/mean": 0.9677799940109253,
	"sampling/importance_sampling_ratio/min": 0.4670157730579376,
	"sampling/sampling_logp_difference/max": 0.7613925933837891,
	"sampling/sampling_logp_difference/mean": 0.014532409608364105,
	"step": 211,
	"step_time": 26.491312149000805
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.04452452051918954,
	"epoch": 0.00424,
	"grad_norm": 0.0009245733381249011,
	"kl": 0.039327465879523515,
	"learning_rate": 7.999954109869554e-06,
	"loss": -0.0,
	"step": 212,
	"step_time": 11.690953868999713
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10158436209894717,
	"epoch": 0.00426,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0037761996500194073,
	"kl": 0.43266808055341244,
	"learning_rate": 7.999953586910674e-06,
	"loss": -0.0,
	"num_tokens": 11155145.0,
	"reward": 2.33209490776062,
	"reward_std": 0.3974522352218628,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9518005847930908,
	"rewards/probe_shaping_dominance/std": 0.15248610079288483,
	"rewards/probe_terminal_raw/mean": 0.046875,
	"rewards/probe_terminal_raw/std": 0.1480722874403,
	"rewards/rollout_reward_func/mean": -0.4603305459022522,
	"rewards/rollout_reward_func/std": 0.2795467674732208,
	"sampling/importance_sampling_ratio/max": 1.5568536520004272,
	"sampling/importance_sampling_ratio/mean": 1.0121254920959473,
	"sampling/importance_sampling_ratio/min": 0.6084503531455994,
	"sampling/sampling_logp_difference/max": 0.49602431058883667,
	"sampling/sampling_logp_difference/mean": 0.017653338611125946,
	"step": 213,
	"step_time": 26.773649626000406
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.10326709412038326,
	"epoch": 0.00428,
	"grad_norm": 0.004299989901483059,
	"kl": 0.4246340822428465,
	"learning_rate": 7.999953060988884e-06,
	"loss": 0.0,
	"step": 214,
	"step_time": 12.393828191000466
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10672931908629835,
	"epoch": 0.0043,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0029812948778271675,
	"kl": 0.5036190063692629,
	"learning_rate": 7.999952532104185e-06,
	"loss": 0.0,
	"num_tokens": 11256499.0,
	"reward": 2.3668174743652344,
	"reward_std": 0.4220028221607208,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.984410285949707,
	"rewards/probe_shaping_dominance/std": 0.08818867057561874,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.458217978477478,
	"rewards/rollout_reward_func/std": 0.1993042230606079,
	"sampling/importance_sampling_ratio/max": 1.2048288583755493,
	"sampling/importance_sampling_ratio/mean": 0.9700103998184204,
	"sampling/importance_sampling_ratio/min": 0.2804865837097168,
	"sampling/sampling_logp_difference/max": 1.2170777320861816,
	"sampling/sampling_logp_difference/mean": 0.027440235018730164,
	"step": 215,
	"step_time": 26.241349470000387
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.10663612652570009,
	"epoch": 0.00432,
	"grad_norm": 0.0025962339714169502,
	"kl": 0.514960631611757,
	"learning_rate": 7.99995200025658e-06,
	"loss": 0.0,
	"step": 216,
	"step_time": 11.455195212000945
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.13120519556105137,
	"epoch": 0.00434,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.00685320096090436,
	"kl": 0.5306107758951839,
	"learning_rate": 7.999951465446065e-06,
	"loss": 0.0,
	"num_tokens": 11358760.0,
	"reward": 2.4137301445007324,
	"reward_std": 0.38182157278060913,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9719860553741455,
	"rewards/probe_shaping_dominance/std": 0.1105431467294693,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.4770059883594513,
	"rewards/rollout_reward_func/std": 0.27089011669158936,
	"sampling/importance_sampling_ratio/max": 1.8946123123168945,
	"sampling/importance_sampling_ratio/mean": 1.0106232166290283,
	"sampling/importance_sampling_ratio/min": 0.6873172521591187,
	"sampling/sampling_logp_difference/max": 0.6602880954742432,
	"sampling/sampling_logp_difference/mean": 0.026765936985611916,
	"step": 217,
	"step_time": 28.19653884499894
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.1328302058391273,
	"epoch": 0.00436,
	"grad_norm": 0.006467514205724001,
	"kl": 0.5236879177391529,
	"learning_rate": 7.999950927672645e-06,
	"loss": 0.0,
	"step": 218,
	"step_time": 11.548230411000986
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0558876832947135,
	"epoch": 0.00438,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002873801626265049,
	"kl": 0.43705418131622764,
	"learning_rate": 7.999950386936317e-06,
	"loss": 0.0001,
	"num_tokens": 11459134.0,
	"reward": 2.4926953315734863,
	"reward_std": 0.2576614320278168,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9532788395881653,
	"rewards/probe_shaping_dominance/std": 0.1259964108467102,
	"rewards/probe_terminal_raw/mean": 0.049288615584373474,
	"rewards/probe_terminal_raw/std": 0.13439743220806122,
	"rewards/rollout_reward_func/mean": -0.4286222755908966,
	"rewards/rollout_reward_func/std": 0.13808076083660126,
	"sampling/importance_sampling_ratio/max": 2.167020320892334,
	"sampling/importance_sampling_ratio/mean": 1.0488494634628296,
	"sampling/importance_sampling_ratio/min": 0.5981054306030273,
	"sampling/sampling_logp_difference/max": 0.773352861404419,
	"sampling/sampling_logp_difference/mean": 0.021742573007941246,
	"step": 219,
	"step_time": 26.59079552000003
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.05215576570481062,
	"epoch": 0.0044,
	"grad_norm": 0.013386573642492294,
	"kl": 0.4328960892962641,
	"learning_rate": 7.999949843237083e-06,
	"loss": 0.0001,
	"step": 220,
	"step_time": 11.575578054999824
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 3.0,
	"completions/mean_terminated_length": 3.0,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 0.10125815495848656,
	"epoch": 0.00442,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.003916088026016951,
	"kl": 0.22720737754934817,
	"learning_rate": 7.999949296574944e-06,
	"loss": 0.0,
	"num_tokens": 11564110.0,
	"reward": 2.5024495124816895,
	"reward_std": 0.21472422778606415,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 2.0,
	"rewards/probe_completion_length/std": 0.0,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9687739014625549,
	"rewards/probe_shaping_dominance/std": 0.12289554625749588,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.4475744962692261,
	"rewards/rollout_reward_func/std": 0.21471136808395386,
	"sampling/importance_sampling_ratio/max": 1.2565096616744995,
	"sampling/importance_sampling_ratio/mean": 0.9851142168045044,
	"sampling/importance_sampling_ratio/min": 0.7785980701446533,
	"sampling/sampling_logp_difference/max": 0.25026071071624756,
	"sampling/sampling_logp_difference/mean": 0.014336168766021729,
	"step": 221,
	"step_time": 28.308517722000943
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.10386610007844865,
	"epoch": 0.00444,
	"grad_norm": 0.0038715400733053684,
	"kl": 0.2309217918664217,
	"learning_rate": 7.9999487469499e-06,
	"loss": 0.0,
	"step": 222,
	"step_time": 11.59164219199829
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08839935716241598,
	"epoch": 0.00446,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0031392991077154875,
	"kl": 0.3969584498627228,
	"learning_rate": 7.999948194361951e-06,
	"loss": 0.0,
	"num_tokens": 11670791.0,
	"reward": 2.504007339477539,
	"reward_std": 0.40813401341438293,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.24593468010425568,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.4459925591945648,
	"rewards/rollout_reward_func/std": 0.22923637926578522,
	"sampling/importance_sampling_ratio/max": 1.2424126863479614,
	"sampling/importance_sampling_ratio/mean": 1.0054875612258911,
	"sampling/importance_sampling_ratio/min": 0.8022926449775696,
	"sampling/sampling_logp_difference/max": 0.2571254372596741,
	"sampling/sampling_logp_difference/mean": 0.01522812806069851,
	"step": 223,
	"step_time": 27.01184939599989
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.08901654137298465,
	"epoch": 0.00448,
	"grad_norm": 0.0026675413828343153,
	"kl": 0.3970091380215308,
	"learning_rate": 7.999947638811098e-06,
	"loss": 0.0,
	"step": 224,
	"step_time": 12.880684480999662
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06860345043241978,
	"epoch": 0.0045,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005898882634937763,
	"kl": 0.2994147054851055,
	"learning_rate": 7.999947080297344e-06,
	"loss": 0.0001,
	"num_tokens": 11778059.0,
	"reward": 2.442521095275879,
	"reward_std": 0.44092267751693726,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.3535533845424652,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5387288331985474,
	"rewards/rollout_reward_func/std": 0.17624567449092865,
	"sampling/importance_sampling_ratio/max": 1.9132263660430908,
	"sampling/importance_sampling_ratio/mean": 1.0267926454544067,
	"sampling/importance_sampling_ratio/min": 0.2760489583015442,
	"sampling/sampling_logp_difference/max": 1.2855275869369507,
	"sampling/sampling_logp_difference/mean": 0.03292452543973923,
	"step": 225,
	"step_time": 26.894577987999583
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.0715375836007297,
	"epoch": 0.00452,
	"grad_norm": 0.004127421882003546,
	"kl": 0.2991956745972857,
	"learning_rate": 7.999946518820686e-06,
	"loss": 0.0001,
	"step": 226,
	"step_time": 11.7451522450001
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07084862189367414,
	"epoch": 0.00454,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.007534612435847521,
	"kl": 0.3083134523330955,
	"learning_rate": 7.999945954381125e-06,
	"loss": -0.0,
	"num_tokens": 11885416.0,
	"reward": 2.2896175384521484,
	"reward_std": 0.4199885129928589,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9857305884361267,
	"rewards/probe_shaping_dominance/std": 0.080719955265522,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.5367380380630493,
	"rewards/rollout_reward_func/std": 0.2644577920436859,
	"sampling/importance_sampling_ratio/max": 1.2167645692825317,
	"sampling/importance_sampling_ratio/mean": 0.9729256629943848,
	"sampling/importance_sampling_ratio/min": 0.5702285766601562,
	"sampling/sampling_logp_difference/max": 0.556563138961792,
	"sampling/sampling_logp_difference/mean": 0.01854308322072029,
	"step": 227,
	"step_time": 26.478597906999312
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.07101618498563766,
	"epoch": 0.00456,
	"grad_norm": 0.005244475323706865,
	"kl": 0.275350460462505,
	"learning_rate": 7.999945386978663e-06,
	"loss": -0.0,
	"step": 228,
	"step_time": 12.815234450999014
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.10753743472741917,
	"epoch": 0.00458,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002793548395857215,
	"kl": 0.3363812413687519,
	"learning_rate": 7.999944816613299e-06,
	"loss": 0.0,
	"num_tokens": 11990346.0,
	"reward": 2.4647884368896484,
	"reward_std": 0.3218696117401123,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9882341623306274,
	"rewards/probe_shaping_dominance/std": 0.06655776500701904,
	"rewards/probe_terminal_raw/mean": 0.011559959501028061,
	"rewards/probe_terminal_raw/std": 0.06539300829172134,
	"rewards/rollout_reward_func/mean": -0.45375561714172363,
	"rewards/rollout_reward_func/std": 0.26721474528312683,
	"sampling/importance_sampling_ratio/max": 1.7522544860839844,
	"sampling/importance_sampling_ratio/mean": 1.0056817531585693,
	"sampling/importance_sampling_ratio/min": 0.39151322841644287,
	"sampling/sampling_logp_difference/max": 0.9377517700195312,
	"sampling/sampling_logp_difference/mean": 0.030310627073049545,
	"step": 229,
	"step_time": 26.652824122999846
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.10178712871856987,
	"epoch": 0.0046,
	"grad_norm": 0.0023058054503053427,
	"kl": 0.3472972925131521,
	"learning_rate": 7.999944243285035e-06,
	"loss": 0.0,
	"step": 230,
	"step_time": 11.641791465999631
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.012500000186264515,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.11396907176822424,
	"epoch": 0.00462,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0053448486141860485,
	"kl": 0.23751085135154426,
	"learning_rate": 7.999943666993872e-06,
	"loss": -0.0,
	"num_tokens": 12094123.0,
	"reward": 2.3231983184814453,
	"reward_std": 0.4537913501262665,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9484962224960327,
	"rewards/probe_shaping_dominance/std": 0.14161793887615204,
	"rewards/probe_terminal_raw/mean": 0.05538617819547653,
	"rewards/probe_terminal_raw/std": 0.15303537249565125,
	"rewards/rollout_reward_func/mean": -0.4744342267513275,
	"rewards/rollout_reward_func/std": 0.27888038754463196,
	"sampling/importance_sampling_ratio/max": 1.2306643724441528,
	"sampling/importance_sampling_ratio/mean": 0.9789013862609863,
	"sampling/importance_sampling_ratio/min": 0.5588669180870056,
	"sampling/sampling_logp_difference/max": 0.5087692737579346,
	"sampling/sampling_logp_difference/mean": 0.027260489761829376,
	"step": 231,
	"step_time": 27.108853302998796
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.11494649667292833,
	"epoch": 0.00464,
	"grad_norm": 0.0034225336275994778,
	"kl": 0.2446515706833452,
	"learning_rate": 7.999943087739808e-06,
	"loss": -0.0,
	"step": 232,
	"step_time": 12.437156906999007
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09780422016046941,
	"epoch": 0.00466,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.00331493909470737,
	"kl": 0.29221273493021727,
	"learning_rate": 7.999942505522845e-06,
	"loss": 0.0,
	"num_tokens": 12202392.0,
	"reward": 2.31793212890625,
	"reward_std": 0.4711916446685791,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9865642786026001,
	"rewards/probe_shaping_dominance/std": 0.07600414007902145,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.44675713777542114,
	"rewards/rollout_reward_func/std": 0.27934086322784424,
	"sampling/importance_sampling_ratio/max": 1.2045822143554688,
	"sampling/importance_sampling_ratio/mean": 0.9702666997909546,
	"sampling/importance_sampling_ratio/min": 0.5390675067901611,
	"sampling/sampling_logp_difference/max": 0.6179147958755493,
	"sampling/sampling_logp_difference/mean": 0.02464653179049492,
	"step": 233,
	"step_time": 27.07101158400019
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.10010742908343673,
	"epoch": 0.00468,
	"grad_norm": 0.00394394900649786,
	"kl": 0.28515962581150234,
	"learning_rate": 7.999941920342986e-06,
	"loss": 0.0,
	"step": 234,
	"step_time": 11.908877233997373
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09357268398161978,
	"epoch": 0.0047,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.003001472679898143,
	"kl": 0.4120303535989933,
	"learning_rate": 7.999941332200228e-06,
	"loss": 0.0,
	"num_tokens": 12307473.0,
	"reward": 2.356600761413574,
	"reward_std": 0.39092886447906494,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9900838732719421,
	"rewards/probe_shaping_dominance/std": 0.05609414726495743,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.47410792112350464,
	"rewards/rollout_reward_func/std": 0.2651825547218323,
	"sampling/importance_sampling_ratio/max": 1.2125083208084106,
	"sampling/importance_sampling_ratio/mean": 0.9483182430267334,
	"sampling/importance_sampling_ratio/min": 0.5642846822738647,
	"sampling/sampling_logp_difference/max": 0.5796399116516113,
	"sampling/sampling_logp_difference/mean": 0.029487669467926025,
	"step": 235,
	"step_time": 27.473293748998913
	},
	{
	"clip_ratio/high_max": 0.06666666828095913,
	"clip_ratio/high_mean": 0.033333334140479565,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.08885149616980925,
	"epoch": 0.00472,
	"grad_norm": 0.004106747917830944,
	"kl": 0.39987785345859805,
	"learning_rate": 7.999940741094573e-06,
	"loss": 0.0,
	"step": 236,
	"step_time": 11.607714889999443
	},
	{
	"clip_ratio/high_max": 0.0416666679084301,
	"clip_ratio/high_mean": 0.02083333395421505,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0416666679084301,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0790116679854691,
	"epoch": 0.00474,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0017510356847196817,
	"kl": 0.49183082331728656,
	"learning_rate": 7.999940147026021e-06,
	"loss": 0.0,
	"num_tokens": 12410261.0,
	"reward": 2.362030029296875,
	"reward_std": 0.48628348112106323,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9550326466560364,
	"rewards/probe_shaping_dominance/std": 0.1423776000738144,
	"rewards/probe_terminal_raw/mean": 0.046875,
	"rewards/probe_terminal_raw/std": 0.1480722874403,
	"rewards/rollout_reward_func/mean": -0.464877724647522,
	"rewards/rollout_reward_func/std": 0.2927810847759247,
	"sampling/importance_sampling_ratio/max": 1.2767555713653564,
	"sampling/importance_sampling_ratio/mean": 1.0007102489471436,
	"sampling/importance_sampling_ratio/min": 0.5674677491188049,
	"sampling/sampling_logp_difference/max": 0.564541220664978,
	"sampling/sampling_logp_difference/mean": 0.017719101160764694,
	"step": 237,
	"step_time": 26.277223889999732
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.02083333395421505,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.031250000931322575,
	"entropy": 0.08091688924469054,
	"epoch": 0.00476,
	"grad_norm": 0.0037676175124943256,
	"kl": 0.4987390860915184,
	"learning_rate": 7.999939549994574e-06,
	"loss": 0.0,
	"step": 238,
	"step_time": 11.42589379400033
	},
	{
	"clip_ratio/high_max": 0.03125,
	"clip_ratio/high_mean": 0.015625,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.015625,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08572797977831215,
	"epoch": 0.00478,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0028349068015813828,
	"kl": 0.29074460588162765,
	"learning_rate": 7.99993895000023e-06,
	"loss": -0.0001,
	"num_tokens": 12515046.0,
	"reward": 2.3852663040161133,
	"reward_std": 0.48509836196899414,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9448926448822021,
	"rewards/probe_shaping_dominance/std": 0.15274296700954437,
	"rewards/probe_terminal_raw/mean": 0.05525914579629898,
	"rewards/probe_terminal_raw/std": 0.15285103023052216,
	"rewards/rollout_reward_func/mean": -0.43988555669784546,
	"rewards/rollout_reward_func/std": 0.28072717785835266,
	"sampling/importance_sampling_ratio/max": 1.2809064388275146,
	"sampling/importance_sampling_ratio/mean": 0.9681559801101685,
	"sampling/importance_sampling_ratio/min": 0.417494535446167,
	"sampling/sampling_logp_difference/max": 0.8734843134880066,
	"sampling/sampling_logp_difference/mean": 0.02679057978093624,
	"step": 239,
	"step_time": 27.850705083998037
	},
	{
	"clip_ratio/high_max": 0.05208333395421505,
	"clip_ratio/high_mean": 0.026041666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.026041666977107525,
	"entropy": 0.09145444841124117,
	"epoch": 0.0048,
	"grad_norm": 0.003533316310495138,
	"kl": 0.276357589289546,
	"learning_rate": 7.999938347042993e-06,
	"loss": -0.0001,
	"step": 240,
	"step_time": 11.650785684000766
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05321495997486636,
	"epoch": 0.00482,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002123113488778472,
	"kl": 0.1996255109550784,
	"learning_rate": 7.999937741122862e-06,
	"loss": 0.0,
	"num_tokens": 12618608.0,
	"reward": 2.31355619430542,
	"reward_std": 0.3297788202762604,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5426939129829407,
	"rewards/rollout_reward_func/std": 0.22457517683506012,
	"sampling/importance_sampling_ratio/max": 1.1050293445587158,
	"sampling/importance_sampling_ratio/mean": 1.0058460235595703,
	"sampling/importance_sampling_ratio/min": 0.9022819995880127,
	"sampling/sampling_logp_difference/max": 0.10648787021636963,
	"sampling/sampling_logp_difference/mean": 0.005735831335186958,
	"step": 241,
	"step_time": 26.73763404300007
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.05486724083311856,
	"epoch": 0.00484,
	"grad_norm": 0.003093272214755416,
	"kl": 0.1941228064047955,
	"learning_rate": 7.999937132239836e-06,
	"loss": 0.0,
	"step": 242,
	"step_time": 11.670754389999274
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07577884336933494,
	"epoch": 0.00486,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0036162908654659986,
	"kl": 0.4399729967590247,
	"learning_rate": 7.999936520393918e-06,
	"loss": 0.0,
	"num_tokens": 12726447.0,
	"reward": 2.3645379543304443,
	"reward_std": 0.41120022535324097,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9753913879394531,
	"rewards/probe_shaping_dominance/std": 0.09771986305713654,
	"rewards/probe_terminal_raw/mean": 0.0260416679084301,
	"rewards/probe_terminal_raw/std": 0.1046360433101654,
	"rewards/rollout_reward_func/mean": -0.4618951678276062,
	"rewards/rollout_reward_func/std": 0.1977241188287735,
	"sampling/importance_sampling_ratio/max": 1.1149406433105469,
	"sampling/importance_sampling_ratio/mean": 0.9780128002166748,
	"sampling/importance_sampling_ratio/min": 0.7354345321655273,
	"sampling/sampling_logp_difference/max": 0.18633489310741425,
	"sampling/sampling_logp_difference/mean": 0.013524588197469711,
	"step": 243,
	"step_time": 27.977090622001015
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.07133703003637493,
	"epoch": 0.00488,
	"grad_norm": 0.002898427424952388,
	"kl": 0.44227540418796707,
	"learning_rate": 7.999935905585108e-06,
	"loss": 0.0,
	"step": 244,
	"step_time": 11.75723793999805
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0315001527142158,
	"epoch": 0.0049,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.001392417005263269,
	"kl": 0.23886053822934628,
	"learning_rate": 7.999935287813407e-06,
	"loss": -0.0,
	"num_tokens": 12827575.0,
	"reward": 2.4073498249053955,
	"reward_std": 0.42101356387138367,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9554626941680908,
	"rewards/probe_shaping_dominance/std": 0.14310474693775177,
	"rewards/probe_terminal_raw/mean": 0.046875,
	"rewards/probe_terminal_raw/std": 0.1480722874403,
	"rewards/rollout_reward_func/mean": -0.4199880063533783,
	"rewards/rollout_reward_func/std": 0.2148957997560501,
	"sampling/importance_sampling_ratio/max": 1.0394365787506104,
	"sampling/importance_sampling_ratio/mean": 0.995591402053833,
	"sampling/importance_sampling_ratio/min": 0.8603565096855164,
	"sampling/sampling_logp_difference/max": 0.1303640604019165,
	"sampling/sampling_logp_difference/mean": 0.004159946460276842,
	"step": 245,
	"step_time": 26.077412141000423
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.032327667491699685,
	"epoch": 0.00492,
	"grad_norm": 0.0010727684712037444,
	"kl": 0.23855953469561797,
	"learning_rate": 7.999934667078813e-06,
	"loss": -0.0,
	"step": 246,
	"step_time": 11.513740063000114
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0816163292620331,
	"epoch": 0.00494,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0027896249666810036,
	"kl": 0.4679242782876827,
	"learning_rate": 7.999934043381328e-06,
	"loss": 0.0,
	"num_tokens": 12935730.0,
	"reward": 2.46283221244812,
	"reward_std": 0.36876291036605835,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9373108148574829,
	"rewards/probe_shaping_dominance/std": 0.1728522628545761,
	"rewards/probe_terminal_raw/mean": 0.0625,
	"rewards/probe_terminal_raw/std": 0.16800537705421448,
	"rewards/rollout_reward_func/mean": -0.3932287096977234,
	"rewards/rollout_reward_func/std": 0.24200834333896637,
	"sampling/importance_sampling_ratio/max": 1.2427064180374146,
	"sampling/importance_sampling_ratio/mean": 1.0063412189483643,
	"sampling/importance_sampling_ratio/min": 0.8085158467292786,
	"sampling/sampling_logp_difference/max": 0.21965795755386353,
	"sampling/sampling_logp_difference/mean": 0.01280665211379528,
	"step": 247,
	"step_time": 28.041720137000084
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.08206989825703204,
	"epoch": 0.00496,
	"grad_norm": 0.00293480372056365,
	"kl": 0.46830739825963974,
	"learning_rate": 7.999933416720957e-06,
	"loss": 0.0,
	"step": 248,
	"step_time": 11.713867525000751
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06533269377541728,
	"epoch": 0.00498,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.003347411984577775,
	"kl": 0.36843465792230745,
	"learning_rate": 7.999932787097692e-06,
	"loss": 0.0001,
	"num_tokens": 13041381.0,
	"reward": 2.382171630859375,
	"reward_std": 0.4231238067150116,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9549021124839783,
	"rewards/probe_shaping_dominance/std": 0.14646287262439728,
	"rewards/probe_terminal_raw/mean": 0.04255589470267296,
	"rewards/probe_terminal_raw/std": 0.13594815135002136,
	"rewards/rollout_reward_func/mean": -0.50278639793396,
	"rewards/rollout_reward_func/std": 0.27676716446876526,
	"sampling/importance_sampling_ratio/max": 1.3422638177871704,
	"sampling/importance_sampling_ratio/mean": 0.9941832423210144,
	"sampling/importance_sampling_ratio/min": 0.6115661263465881,
	"sampling/sampling_logp_difference/max": 0.4917324185371399,
	"sampling/sampling_logp_difference/mean": 0.018511097878217697,
	"step": 249,
	"step_time": 26.64282015000026
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.07043309864820912,
	"epoch": 0.005,
	"grad_norm": 0.0035562312696129084,
	"kl": 0.359963540629451,
	"learning_rate": 7.999932154511542e-06,
	"loss": 0.0,
	"step": 250,
	"step_time": 11.727345789000537
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08174855704419315,
	"epoch": 0.00502,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.003543607424944639,
	"kl": 0.5413316028789268,
	"learning_rate": 7.999931518962502e-06,
	"loss": 0.0,
	"num_tokens": 13146021.0,
	"reward": 2.4559497833251953,
	"reward_std": 0.3885264992713928,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9621438384056091,
	"rewards/probe_shaping_dominance/std": 0.1238301619887352,
	"rewards/probe_terminal_raw/mean": 0.03963414579629898,
	"rewards/probe_terminal_raw/std": 0.12972840666770935,
	"rewards/rollout_reward_func/mean": -0.40207818150520325,
	"rewards/rollout_reward_func/std": 0.2555524408817291,
	"sampling/importance_sampling_ratio/max": 1.1064826250076294,
	"sampling/importance_sampling_ratio/mean": 0.954660177230835,
	"sampling/importance_sampling_ratio/min": 0.41962218284606934,
	"sampling/sampling_logp_difference/max": 0.7979011535644531,
	"sampling/sampling_logp_difference/mean": 0.023729108273983,
	"step": 251,
	"step_time": 27.992850227999952
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.07990033202804625,
	"epoch": 0.00504,
	"grad_norm": 0.003231135895475745,
	"kl": 0.524783481414488,
	"learning_rate": 7.999930880450575e-06,
	"loss": 0.0,
	"step": 252,
	"step_time": 11.643585757999972
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.07902092937729321,
	"epoch": 0.00506,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006585233379155397,
	"kl": 0.37969694038247326,
	"learning_rate": 7.99993023897576e-06,
	"loss": 0.0,
	"num_tokens": 13246298.0,
	"reward": 2.4005722999572754,
	"reward_std": 0.3679780662059784,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.974242091178894,
	"rewards/probe_shaping_dominance/std": 0.10219167917966843,
	"rewards/probe_terminal_raw/mean": 0.026549797505140305,
	"rewards/probe_terminal_raw/std": 0.10620416700839996,
	"rewards/rollout_reward_func/mean": -0.42521971464157104,
	"rewards/rollout_reward_func/std": 0.21645236015319824,
	"sampling/importance_sampling_ratio/max": 1.969668984413147,
	"sampling/importance_sampling_ratio/mean": 1.0500105619430542,
	"sampling/importance_sampling_ratio/min": 0.7689392566680908,
	"sampling/sampling_logp_difference/max": 0.6780328750610352,
	"sampling/sampling_logp_difference/mean": 0.02139047347009182,
	"step": 253,
	"step_time": 26.232431414999155
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.07854951097397134,
	"epoch": 0.00508,
	"grad_norm": 0.005968212615698576,
	"kl": 0.3778405386647137,
	"learning_rate": 7.99992959453806e-06,
	"loss": 0.0,
	"step": 254,
	"step_time": 12.017544923999594
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04191483659815276,
	"epoch": 0.0051,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004378916695713997,
	"kl": 0.3174490866222186,
	"learning_rate": 7.999928947137475e-06,
	"loss": -0.0,
	"num_tokens": 13351235.0,
	"reward": 2.3821582794189453,
	"reward_std": 0.4624309539794922,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9869383573532104,
	"rewards/probe_shaping_dominance/std": 0.07388784736394882,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.41415512561798096,
	"rewards/rollout_reward_func/std": 0.23873184621334076,
	"sampling/importance_sampling_ratio/max": 1.257253885269165,
	"sampling/importance_sampling_ratio/mean": 1.011238932609558,
	"sampling/importance_sampling_ratio/min": 0.9685202836990356,
	"sampling/sampling_logp_difference/max": 0.2289290428161621,
	"sampling/sampling_logp_difference/mean": 0.005532183218747377,
	"step": 255,
	"step_time": 28.14665811800114
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.044428632616472896,
	"epoch": 0.00512,
	"grad_norm": 0.001523565617389977,
	"kl": 0.3174588828405831,
	"learning_rate": 7.999928296774006e-06,
	"loss": -0.0,
	"step": 256,
	"step_time": 11.396023698001045
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.11287707928568125,
	"epoch": 0.00514,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0049528395757079124,
	"kl": 0.3751811153779272,
	"learning_rate": 7.999927643447652e-06,
	"loss": -0.0001,
	"num_tokens": 13453732.0,
	"reward": 2.2990427017211914,
	"reward_std": 0.4729869067668915,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.8994619250297546,
	"rewards/probe_shaping_dominance/std": 0.23433801531791687,
	"rewards/probe_terminal_raw/mean": 0.08892276883125305,
	"rewards/probe_terminal_raw/std": 0.1897670477628708,
	"rewards/rollout_reward_func/mean": -0.451841801404953,
	"rewards/rollout_reward_func/std": 0.3020572066307068,
	"sampling/importance_sampling_ratio/max": 1.7735323905944824,
	"sampling/importance_sampling_ratio/mean": 1.0311025381088257,
	"sampling/importance_sampling_ratio/min": 0.48170769214630127,
	"sampling/sampling_logp_difference/max": 0.5872056484222412,
	"sampling/sampling_logp_difference/mean": 0.03187928348779678,
	"step": 257,
	"step_time": 27.428863920001277
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.11026706825941801,
	"epoch": 0.00516,
	"grad_norm": 0.0036789914593100548,
	"kl": 0.37549637774645817,
	"learning_rate": 7.999926987158413e-06,
	"loss": -0.0001,
	"step": 258,
	"step_time": 12.307902244997422
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09494227101095021,
	"epoch": 0.00518,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.004995207767933607,
	"kl": 0.5894506504137098,
	"learning_rate": 7.999926327906292e-06,
	"loss": 0.0,
	"num_tokens": 13559320.0,
	"reward": 2.3814258575439453,
	"reward_std": 0.36968865990638733,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9921875,
	"rewards/probe_shaping_dominance/std": 0.04419417306780815,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.4826367497444153,
	"rewards/rollout_reward_func/std": 0.231715127825737,
	"sampling/importance_sampling_ratio/max": 1.2988759279251099,
	"sampling/importance_sampling_ratio/mean": 0.989588737487793,
	"sampling/importance_sampling_ratio/min": 0.3728586435317993,
	"sampling/sampling_logp_difference/max": 0.9864900708198547,
	"sampling/sampling_logp_difference/mean": 0.030208630487322807,
	"step": 259,
	"step_time": 28.526762178002173
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.09542630659416318,
	"epoch": 0.0052,
	"grad_norm": 0.009572784416377544,
	"kl": 0.5865388629335939,
	"learning_rate": 7.999925665691289e-06,
	"loss": 0.0,
	"step": 260,
	"step_time": 11.52395996999985
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.042740301505546086,
	"epoch": 0.00522,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0034757580142468214,
	"kl": 0.16234587341508444,
	"learning_rate": 7.999925000513405e-06,
	"loss": 0.0001,
	"num_tokens": 13662277.0,
	"reward": 2.3550405502319336,
	"reward_std": 0.3789060413837433,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9763131737709045,
	"rewards/probe_shaping_dominance/std": 0.09356633573770523,
	"rewards/probe_terminal_raw/mean": 0.023119919002056122,
	"rewards/probe_terminal_raw/std": 0.0910695344209671,
	"rewards/rollout_reward_func/mean": -0.4693926274776459,
	"rewards/rollout_reward_func/std": 0.27393800020217896,
	"sampling/importance_sampling_ratio/max": 1.9132373332977295,
	"sampling/importance_sampling_ratio/mean": 1.0334219932556152,
	"sampling/importance_sampling_ratio/min": 0.8748363256454468,
	"sampling/sampling_logp_difference/max": 0.648794412612915,
	"sampling/sampling_logp_difference/mean": 0.015361637808382511,
	"step": 261,
	"step_time": 27.68740953500128
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.043997991022479255,
	"epoch": 0.00524,
	"grad_norm": 0.0034889201633632183,
	"kl": 0.1585660980490502,
	"learning_rate": 7.999924332372639e-06,
	"loss": 0.0,
	"step": 262,
	"step_time": 12.369422526000562
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.06730000481184106,
	"epoch": 0.00526,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0019632691983133554,
	"kl": 0.2906430190632818,
	"learning_rate": 7.999923661268994e-06,
	"loss": -0.0,
	"num_tokens": 13768535.0,
	"reward": 2.461604356765747,
	"reward_std": 0.28569555282592773,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9790951013565063,
	"rewards/probe_shaping_dominance/std": 0.08612176775932312,
	"rewards/probe_terminal_raw/mean": 0.023373983800411224,
	"rewards/probe_terminal_raw/std": 0.09738598018884659,
	"rewards/rollout_reward_func/mean": -0.42836469411849976,
	"rewards/rollout_reward_func/std": 0.21179892122745514,
	"sampling/importance_sampling_ratio/max": 1.027362585067749,
	"sampling/importance_sampling_ratio/mean": 0.911888837814331,
	"sampling/importance_sampling_ratio/min": 0.0,
	"sampling/sampling_logp_difference/max": 1.2283318042755127,
	"sampling/sampling_logp_difference/mean": 0.04068940505385399,
	"step": 263,
	"step_time": 28.125288621000436
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.0668873688664462,
	"epoch": 0.00528,
	"grad_norm": 0.0020422539673745632,
	"kl": 0.30596065653662663,
	"learning_rate": 7.999922987202466e-06,
	"loss": -0.0,
	"step": 264,
	"step_time": 11.507015873000455
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05058241146616638,
	"epoch": 0.0053,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0018712878227233887,
	"kl": 0.39055716490838677,
	"learning_rate": 7.999922310173063e-06,
	"loss": -0.0,
	"num_tokens": 13871840.0,
	"reward": 2.4825406074523926,
	"reward_std": 0.31064870953559875,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9752524495124817,
	"rewards/probe_shaping_dominance/std": 0.09777678549289703,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.41146183013916016,
	"rewards/rollout_reward_func/std": 0.21425116062164307,
	"sampling/importance_sampling_ratio/max": 1.5599281787872314,
	"sampling/importance_sampling_ratio/mean": 1.0341243743896484,
	"sampling/importance_sampling_ratio/min": 0.8953186869621277,
	"sampling/sampling_logp_difference/max": 0.4449194669723511,
	"sampling/sampling_logp_difference/mean": 0.013410702347755432,
	"step": 265,
	"step_time": 27.96838706700055
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.04936367901973426,
	"epoch": 0.00532,
	"grad_norm": 0.006141372956335545,
	"kl": 0.3867563092110231,
	"learning_rate": 7.99992163018078e-06,
	"loss": -0.0,
	"step": 266,
	"step_time": 12.308435358998395
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05740413888270268,
	"epoch": 0.00534,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0028442663606256247,
	"kl": 0.3010439347126521,
	"learning_rate": 7.99992094722562e-06,
	"loss": -0.0,
	"num_tokens": 13974703.0,
	"reward": 2.375330924987793,
	"reward_std": 0.3971181809902191,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9330692291259766,
	"rewards/probe_shaping_dominance/std": 0.15932095050811768,
	"rewards/probe_terminal_raw/mean": 0.06885162740945816,
	"rewards/probe_terminal_raw/std": 0.1653386801481247,
	"rewards/rollout_reward_func/mean": -0.42034000158309937,
	"rewards/rollout_reward_func/std": 0.19739177823066711,
	"sampling/importance_sampling_ratio/max": 1.2114074230194092,
	"sampling/importance_sampling_ratio/mean": 0.9802918434143066,
	"sampling/importance_sampling_ratio/min": 0.3451912999153137,
	"sampling/sampling_logp_difference/max": 1.0613338947296143,
	"sampling/sampling_logp_difference/mean": 0.018370507284998894,
	"step": 267,
	"step_time": 27.86067632400045
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.055183965210744645,
	"epoch": 0.00536,
	"grad_norm": 0.0022630670573562384,
	"kl": 0.344313826324651,
	"learning_rate": 7.999920261307583e-06,
	"loss": -0.0,
	"step": 268,
	"step_time": 11.746586444000059
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.08427908451994881,
	"epoch": 0.00538,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0037011466920375824,
	"kl": 0.412635525688529,
	"learning_rate": 7.999919572426668e-06,
	"loss": -0.0,
	"num_tokens": 14078089.0,
	"reward": 2.4167308807373047,
	"reward_std": 0.32326242327690125,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9451819658279419,
	"rewards/probe_shaping_dominance/std": 0.147642120718956,
	"rewards/probe_terminal_raw/mean": 0.05843495950102806,
	"rewards/probe_terminal_raw/std": 0.15837596356868744,
	"rewards/rollout_reward_func/mean": -0.44313597679138184,
	"rewards/rollout_reward_func/std": 0.24654169380664825,
	"sampling/importance_sampling_ratio/max": 1.858984112739563,
	"sampling/importance_sampling_ratio/mean": 0.9879124164581299,
	"sampling/importance_sampling_ratio/min": 0.6056866645812988,
	"sampling/sampling_logp_difference/max": 0.6200296878814697,
	"sampling/sampling_logp_difference/mean": 0.027817152440547943,
	"step": 269,
	"step_time": 26.451382616000046
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.07716414582682773,
	"epoch": 0.0054,
	"grad_norm": 0.0030677285976707935,
	"kl": 0.4153696422581561,
	"learning_rate": 7.999918880582879e-06,
	"loss": -0.0,
	"step": 270,
	"step_time": 12.785874016998605
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04053633386229194,
	"epoch": 0.00542,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.001796143944375217,
	"kl": 0.5015344847925007,
	"learning_rate": 7.999918185776215e-06,
	"loss": 0.0,
	"num_tokens": 14181503.0,
	"reward": 2.4646096229553223,
	"reward_std": 0.2045918107032776,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9679263234138489,
	"rewards/probe_shaping_dominance/std": 0.1049569845199585,
	"rewards/probe_terminal_raw/mean": 0.0364583358168602,
	"rewards/probe_terminal_raw/std": 0.11773227155208588,
	"rewards/rollout_reward_func/mean": -0.4585248529911041,
	"rewards/rollout_reward_func/std": 0.16162419319152832,
	"sampling/importance_sampling_ratio/max": 1.4571605920791626,
	"sampling/importance_sampling_ratio/mean": 1.0197436809539795,
	"sampling/importance_sampling_ratio/min": 0.8846800923347473,
	"sampling/sampling_logp_difference/max": 0.3764890432357788,
	"sampling/sampling_logp_difference/mean": 0.012306122109293938,
	"step": 271,
	"step_time": 26.693239825001
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.038741875116102165,
	"epoch": 0.00544,
	"grad_norm": 0.0020677302964031696,
	"kl": 0.5029990994371474,
	"learning_rate": 7.999917488006676e-06,
	"loss": 0.0,
	"step": 272,
	"step_time": 11.444299719997616
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04818115712259896,
	"epoch": 0.00546,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.008343451656401157,
	"kl": 0.7089566249400381,
	"learning_rate": 7.999916787274264e-06,
	"loss": 0.0001,
	"num_tokens": 14287480.0,
	"reward": 2.4599452018737793,
	"reward_std": 0.38899266719818115,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9238950610160828,
	"rewards/probe_shaping_dominance/std": 0.16443566977977753,
	"rewards/probe_terminal_raw/mean": 0.08130080997943878,
	"rewards/probe_terminal_raw/std": 0.17714287340641022,
	"rewards/rollout_reward_func/mean": -0.3702506721019745,
	"rewards/rollout_reward_func/std": 0.21257071197032928,
	"sampling/importance_sampling_ratio/max": 2.423100471496582,
	"sampling/importance_sampling_ratio/mean": 1.0725514888763428,
	"sampling/importance_sampling_ratio/min": 0.8080363273620605,
	"sampling/sampling_logp_difference/max": 0.8850466012954712,
	"sampling/sampling_logp_difference/mean": 0.024975256994366646,
	"step": 273,
	"step_time": 28.09797250900101
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.04680645616099355,
	"epoch": 0.00548,
	"grad_norm": 0.003927062265574932,
	"kl": 0.742738697305322,
	"learning_rate": 7.99991608357898e-06,
	"loss": 0.0001,
	"step": 274,
	"step_time": 11.650237371000003
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04380835813935846,
	"epoch": 0.0055,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0025579470675438643,
	"kl": 0.21995878049926887,
	"learning_rate": 7.999915376920822e-06,
	"loss": -0.0,
	"num_tokens": 14387389.0,
	"reward": 2.2633914947509766,
	"reward_std": 0.42217421531677246,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9556913375854492,
	"rewards/probe_shaping_dominance/std": 0.1223374605178833,
	"rewards/probe_terminal_raw/mean": 0.0518292672932148,
	"rewards/probe_terminal_raw/std": 0.14265993237495422,
	"rewards/rollout_reward_func/mean": -0.5378788709640503,
	"rewards/rollout_reward_func/std": 0.23384462296962738,
	"sampling/importance_sampling_ratio/max": 1.084592580795288,
	"sampling/importance_sampling_ratio/mean": 0.9922658205032349,
	"sampling/importance_sampling_ratio/min": 0.7613502740859985,
	"sampling/sampling_logp_difference/max": 0.2726619839668274,
	"sampling/sampling_logp_difference/mean": 0.009103155694901943,
	"step": 275,
	"step_time": 26.459266137000668
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.03994250175310299,
	"epoch": 0.00552,
	"grad_norm": 0.0021381748374551535,
	"kl": 0.2157795349397702,
	"learning_rate": 7.999914667299794e-06,
	"loss": -0.0,
	"step": 276,
	"step_time": 11.672075437000785
	},
	{
	"clip_ratio/high_max": 0.05000000074505806,
	"clip_ratio/high_mean": 0.02500000037252903,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.037500000558793545,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.09868528880178928,
	"epoch": 0.00554,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0031571455765515566,
	"kl": 0.4792258571833372,
	"learning_rate": 7.999913954715895e-06,
	"loss": 0.0,
	"num_tokens": 14492025.0,
	"reward": 2.2542710304260254,
	"reward_std": 0.38688531517982483,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.5082289576530457,
	"rewards/rollout_reward_func/std": 0.17395071685314178,
	"sampling/importance_sampling_ratio/max": 1.9612770080566406,
	"sampling/importance_sampling_ratio/mean": 1.0468454360961914,
	"sampling/importance_sampling_ratio/min": 0.5976178646087646,
	"sampling/sampling_logp_difference/max": 0.7003155946731567,
	"sampling/sampling_logp_difference/mean": 0.032625701278448105,
	"step": 277,
	"step_time": 27.236174976000257
	},
	{
	"clip_ratio/high_max": 0.05000000074505806,
	"clip_ratio/high_mean": 0.02500000037252903,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.037500000558793545,
	"entropy": 0.09623363520950079,
	"epoch": 0.00556,
	"grad_norm": 0.0032991948537528515,
	"kl": 0.4749853519606404,
	"learning_rate": 7.999913239169126e-06,
	"loss": 0.0,
	"step": 278,
	"step_time": 12.07052038799975
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.8125,
	"completions/mean_terminated_length": 2.8125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04121039004530758,
	"epoch": 0.00558,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0032093473710119724,
	"kl": 0.6897661700841127,
	"learning_rate": 7.999912520659488e-06,
	"loss": 0.0,
	"num_tokens": 14593223.0,
	"reward": 2.3469300270080566,
	"reward_std": 0.5208548307418823,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.8125,
	"rewards/probe_completion_length/std": 0.3965577781200409,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.95980304479599,
	"rewards/probe_shaping_dominance/std": 0.12795211374759674,
	"rewards/probe_terminal_raw/mean": 0.042174797505140305,
	"rewards/probe_terminal_raw/std": 0.13503843545913696,
	"rewards/rollout_reward_func/mean": -0.44879791140556335,
	"rewards/rollout_reward_func/std": 0.2045743763446808,
	"sampling/importance_sampling_ratio/max": 1.9838464260101318,
	"sampling/importance_sampling_ratio/mean": 1.0156028270721436,
	"sampling/importance_sampling_ratio/min": 0.1315358281135559,
	"sampling/sampling_logp_difference/max": 2.028473377227783,
	"sampling/sampling_logp_difference/mean": 0.03758270666003227,
	"step": 279,
	"step_time": 26.44161211500159
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.033333334140479565,
	"entropy": 0.047012478462420404,
	"epoch": 0.0056,
	"grad_norm": 0.0013261314015835524,
	"kl": 0.7127395562856691,
	"learning_rate": 7.99991179918698e-06,
	"loss": -0.0,
	"step": 280,
	"step_time": 11.634762280001269
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.012499196142016444,
	"epoch": 0.00562,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0003787693567574024,
	"kl": 0.669078703969717,
	"learning_rate": 7.999911074751606e-06,
	"loss": -0.0,
	"num_tokens": 14693012.0,
	"reward": 2.4939217567443848,
	"reward_std": 0.381552517414093,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9186484813690186,
	"rewards/probe_shaping_dominance/std": 0.195449098944664,
	"rewards/probe_terminal_raw/mean": 0.078125,
	"rewards/probe_terminal_raw/std": 0.18445101380348206,
	"rewards/rollout_reward_func/mean": -0.3903515338897705,
	"rewards/rollout_reward_func/std": 0.2618943452835083,
	"sampling/importance_sampling_ratio/max": 1.0298659801483154,
	"sampling/importance_sampling_ratio/mean": 0.9976564645767212,
	"sampling/importance_sampling_ratio/min": 0.9420029520988464,
	"sampling/sampling_logp_difference/max": 0.05974767729640007,
	"sampling/sampling_logp_difference/mean": 0.0016555668553337455,
	"step": 281,
	"step_time": 26.723270941998635
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.012372259192488855,
	"epoch": 0.00564,
	"grad_norm": 0.0003435203689150512,
	"kl": 0.6690934834768996,
	"learning_rate": 7.999910347353363e-06,
	"loss": -0.0,
	"step": 282,
	"step_time": 11.794659334002063
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.84375,
	"completions/mean_terminated_length": 2.84375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.045614961185492575,
	"epoch": 0.00566,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.003150342497974634,
	"kl": 0.48013901670856285,
	"learning_rate": 7.999909616992255e-06,
	"loss": -0.0,
	"num_tokens": 14799672.0,
	"reward": 2.3399429321289062,
	"reward_std": 0.4422038793563843,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.84375,
	"rewards/probe_completion_length/std": 0.3689020276069641,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 1.0,
	"rewards/probe_shaping_dominance/std": 0.0,
	"rewards/probe_terminal_raw/mean": 0.0,
	"rewards/probe_terminal_raw/std": 0.0,
	"rewards/rollout_reward_func/mean": -0.45380693674087524,
	"rewards/rollout_reward_func/std": 0.1835639625787735,
	"sampling/importance_sampling_ratio/max": 1.2092225551605225,
	"sampling/importance_sampling_ratio/mean": 0.9782531261444092,
	"sampling/importance_sampling_ratio/min": 0.3157159686088562,
	"sampling/sampling_logp_difference/max": 1.1528494358062744,
	"sampling/sampling_logp_difference/mean": 0.019979460164904594,
	"step": 283,
	"step_time": 27.036351400000058
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.03985181718599051,
	"epoch": 0.00568,
	"grad_norm": 0.0033008423633873463,
	"kl": 0.49970418894372415,
	"learning_rate": 7.99990888366828e-06,
	"loss": -0.0,
	"step": 284,
	"step_time": 11.668078124000203
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.03972258236899506,
	"epoch": 0.0057,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.002630846342071891,
	"kl": 0.3517824411392212,
	"learning_rate": 7.99990814738144e-06,
	"loss": -0.0,
	"num_tokens": 14902831.0,
	"reward": 2.4359757900238037,
	"reward_std": 0.2911105751991272,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9871374368667603,
	"rewards/probe_shaping_dominance/std": 0.07276186347007751,
	"rewards/probe_terminal_raw/mean": 0.015625,
	"rewards/probe_terminal_raw/std": 0.0883883461356163,
	"rewards/rollout_reward_func/mean": -0.48553669452667236,
	"rewards/rollout_reward_func/std": 0.2099909633398056,
	"sampling/importance_sampling_ratio/max": 1.558259129524231,
	"sampling/importance_sampling_ratio/mean": 1.021366834640503,
	"sampling/importance_sampling_ratio/min": 0.757884681224823,
	"sampling/sampling_logp_difference/max": 0.4435689449310303,
	"sampling/sampling_logp_difference/mean": 0.011840267106890678,
	"step": 285,
	"step_time": 27.424052481000217
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.040461032156599686,
	"epoch": 0.00572,
	"grad_norm": 0.002737229922786355,
	"kl": 0.3537818659096956,
	"learning_rate": 7.999907408131737e-06,
	"loss": -0.0,
	"step": 286,
	"step_time": 12.126654321001297
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.96875,
	"completions/mean_terminated_length": 2.96875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.020485240605921717,
	"epoch": 0.00574,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.000876868492923677,
	"kl": 0.23688423214722576,
	"learning_rate": 7.999906665919169e-06,
	"loss": -0.0,
	"num_tokens": 15005261.0,
	"reward": 2.5098652839660645,
	"reward_std": 0.30707597732543945,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.96875,
	"rewards/probe_completion_length/std": 0.1767766922712326,
	"rewards/probe_invalid_count/mean": 0.03125,
	"rewards/probe_invalid_count/std": 0.1767766922712326,
	"rewards/probe_shaping_dominance/mean": 0.9733736515045166,
	"rewards/probe_shaping_dominance/std": 0.10744811594486237,
	"rewards/probe_terminal_raw/mean": 0.0260416679084301,
	"rewards/probe_terminal_raw/std": 0.1046360433101654,
	"rewards/rollout_reward_func/mean": -0.4395501911640167,
	"rewards/rollout_reward_func/std": 0.18828870356082916,
	"sampling/importance_sampling_ratio/max": 1.0840176343917847,
	"sampling/importance_sampling_ratio/mean": 1.0012118816375732,
	"sampling/importance_sampling_ratio/min": 0.9655031561851501,
	"sampling/sampling_logp_difference/max": 0.08256775140762329,
	"sampling/sampling_logp_difference/mean": 0.00235398905351758,
	"step": 287,
	"step_time": 27.075085327001034
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.022996263058303157,
	"epoch": 0.00576,
	"grad_norm": 0.0009354232461191714,
	"kl": 0.23660576696175895,
	"learning_rate": 7.99990592074374e-06,
	"loss": -0.0,
	"step": 288,
	"step_time": 11.657212093999078
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.05316271091851377,
	"epoch": 0.00578,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.006305683869868517,
	"kl": 0.2035164695232652,
	"learning_rate": 7.999905172605446e-06,
	"loss": -0.0001,
	"num_tokens": 15107252.0,
	"reward": 2.422664165496826,
	"reward_std": 0.37807923555374146,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9658713340759277,
	"rewards/probe_shaping_dominance/std": 0.10942408442497253,
	"rewards/probe_terminal_raw/mean": 0.03315548598766327,
	"rewards/probe_terminal_raw/std": 0.1095743477344513,
	"rewards/rollout_reward_func/mean": -0.40136268734931946,
	"rewards/rollout_reward_func/std": 0.2093636691570282,
	"sampling/importance_sampling_ratio/max": 1.5805177688598633,
	"sampling/importance_sampling_ratio/mean": 1.0220205783843994,
	"sampling/importance_sampling_ratio/min": 0.7326148748397827,
	"sampling/sampling_logp_difference/max": 0.4577510356903076,
	"sampling/sampling_logp_difference/mean": 0.019495096057653427,
	"step": 289,
	"step_time": 26.987616914999307
	},
	{
	"clip_ratio/high_max": 0.04583333432674408,
	"clip_ratio/high_mean": 0.02291666716337204,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02291666716337204,
	"entropy": 0.05469944020660478,
	"epoch": 0.0058,
	"grad_norm": 0.0032733359839767218,
	"kl": 0.18666235760611016,
	"learning_rate": 7.999904421504293e-06,
	"loss": -0.0001,
	"step": 290,
	"step_time": 11.951281235001261
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.02291666716337204,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.035416667349636555,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.0591709428122158,
	"epoch": 0.00582,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0026493356563150883,
	"kl": 0.8575776647776365,
	"learning_rate": 7.999903667440278e-06,
	"loss": 0.0,
	"num_tokens": 15208793.0,
	"reward": 2.402831792831421,
	"reward_std": 0.3910689353942871,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9471915364265442,
	"rewards/probe_shaping_dominance/std": 0.1431892067193985,
	"rewards/probe_terminal_raw/mean": 0.0520833358168602,
	"rewards/probe_terminal_raw/std": 0.1433027982711792,
	"rewards/rollout_reward_func/mean": -0.42144304513931274,
	"rewards/rollout_reward_func/std": 0.21596133708953857,
	"sampling/importance_sampling_ratio/max": 1.0310035943984985,
	"sampling/importance_sampling_ratio/mean": 0.9701290130615234,
	"sampling/importance_sampling_ratio/min": 0.5706773400306702,
	"sampling/sampling_logp_difference/max": 0.5609317421913147,
	"sampling/sampling_logp_difference/mean": 0.014490557834506035,
	"step": 291,
	"step_time": 27.075510762000704
	},
	{
	"clip_ratio/high_max": 0.02500000037252903,
	"clip_ratio/high_mean": 0.012500000186264515,
	"clip_ratio/low_mean": 0.012500000186264515,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02500000037252903,
	"entropy": 0.05919087287338698,
	"epoch": 0.00584,
	"grad_norm": 0.0026768911629915237,
	"kl": 0.8454538804168692,
	"learning_rate": 7.999902910413404e-06,
	"loss": 0.0,
	"step": 292,
	"step_time": 12.032383580999522
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04045550918681329,
	"epoch": 0.00586,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.00724328076466918,
	"kl": 0.80053227301687,
	"learning_rate": 7.999902150423671e-06,
	"loss": -0.0001,
	"num_tokens": 15311233.0,
	"reward": 2.4362893104553223,
	"reward_std": 0.426661878824234,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9734768271446228,
	"rewards/probe_shaping_dominance/std": 0.10647083818912506,
	"rewards/probe_terminal_raw/mean": 0.026295732706785202,
	"rewards/probe_terminal_raw/std": 0.10541322082281113,
	"rewards/rollout_reward_func/mean": -0.38848331570625305,
	"rewards/rollout_reward_func/std": 0.2122591733932495,
	"sampling/importance_sampling_ratio/max": 1.8292688131332397,
	"sampling/importance_sampling_ratio/mean": 1.001596212387085,
	"sampling/importance_sampling_ratio/min": 0.44141146540641785,
	"sampling/sampling_logp_difference/max": 0.8177778720855713,
	"sampling/sampling_logp_difference/mean": 0.025196455419063568,
	"step": 293,
	"step_time": 27.23413759199957
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.010416666977107525,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.02083333395421505,
	"entropy": 0.04230553897491518,
	"epoch": 0.00588,
	"grad_norm": 0.005148016382008791,
	"kl": 0.6622665030881763,
	"learning_rate": 7.999901387471079e-06,
	"loss": -0.0001,
	"step": 294,
	"step_time": 11.526401772997815
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.875,
	"completions/mean_terminated_length": 2.875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.03366142028335162,
	"epoch": 0.0059,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.005694986321032047,
	"kl": 0.39196249035501296,
	"learning_rate": 7.99990062155563e-06,
	"loss": 0.0,
	"num_tokens": 15421347.0,
	"reward": 2.391371726989746,
	"reward_std": 0.43072906136512756,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.875,
	"rewards/probe_completion_length/std": 0.33601075410842896,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9607213735580444,
	"rewards/probe_shaping_dominance/std": 0.12690994143486023,
	"rewards/probe_terminal_raw/mean": 0.046875,
	"rewards/probe_terminal_raw/std": 0.1480722874403,
	"rewards/rollout_reward_func/mean": -0.4412246346473694,
	"rewards/rollout_reward_func/std": 0.21457210183143616,
	"sampling/importance_sampling_ratio/max": 1.2205545902252197,
	"sampling/importance_sampling_ratio/mean": 0.9986574053764343,
	"sampling/importance_sampling_ratio/min": 0.7592641115188599,
	"sampling/sampling_logp_difference/max": 0.2809281349182129,
	"sampling/sampling_logp_difference/mean": 0.008172519505023956,
	"step": 295,
	"step_time": 26.643844086999707
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.030476719188300194,
	"epoch": 0.00592,
	"grad_norm": 0.005326179787516594,
	"kl": 0.39566947892306814,
	"learning_rate": 7.999899852677322e-06,
	"loss": 0.0,
	"step": 296,
	"step_time": 12.454534126997714
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.90625,
	"completions/mean_terminated_length": 2.90625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.04483710537169827,
	"epoch": 0.00594,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0021372437477111816,
	"kl": 0.4166623194081088,
	"learning_rate": 7.99989908083616e-06,
	"loss": 0.0,
	"num_tokens": 15523076.0,
	"reward": 2.4664759635925293,
	"reward_std": 0.4568862318992615,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.90625,
	"rewards/probe_completion_length/std": 0.2961445748806,
	"rewards/probe_invalid_count/mean": 0.0625,
	"rewards/probe_invalid_count/std": 0.24593468010425568,
	"rewards/probe_shaping_dominance/mean": 0.9591568112373352,
	"rewards/probe_shaping_dominance/std": 0.11802849918603897,
	"rewards/probe_terminal_raw/mean": 0.04509654641151428,
	"rewards/probe_terminal_raw/std": 0.1317683309316635,
	"rewards/rollout_reward_func/mean": -0.4565274119377136,
	"rewards/rollout_reward_func/std": 0.26263633370399475,
	"sampling/importance_sampling_ratio/max": 1.3225888013839722,
	"sampling/importance_sampling_ratio/mean": 1.0174564123153687,
	"sampling/importance_sampling_ratio/min": 0.8623110055923462,
	"sampling/sampling_logp_difference/max": 0.27959030866622925,
	"sampling/sampling_logp_difference/mean": 0.008479975163936615,
	"step": 297,
	"step_time": 26.703501694998522
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"entropy": 0.04334457405639114,
	"epoch": 0.00596,
	"grad_norm": 0.004324762150645256,
	"kl": 0.41364979138597846,
	"learning_rate": 7.999898306032144e-06,
	"loss": 0.0,
	"step": 298,
	"step_time": 11.624797897999088
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 3.0,
	"completions/max_terminated_length": 3.0,
	"completions/mean_length": 2.9375,
	"completions/mean_terminated_length": 2.9375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 0.039461553949308836,
	"epoch": 0.00598,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.0021312020253390074,
	"kl": 0.4295559982638224,
	"learning_rate": 7.999897528265272e-06,
	"loss": 0.0,
	"num_tokens": 15625505.0,
	"reward": 2.4885663986206055,
	"reward_std": 0.32209959626197815,
	"rewards/format_guard/mean": -0.05000000074505806,
	"rewards/format_guard/std": 0.0,
	"rewards/probe_completion_length/mean": 1.9375,
	"rewards/probe_completion_length/std": 0.24593468010425568,
	"rewards/probe_invalid_count/mean": 0.0,
	"rewards/probe_invalid_count/std": 0.0,
	"rewards/probe_shaping_dominance/mean": 0.9723982810974121,
	"rewards/probe_shaping_dominance/std": 0.10864228010177612,
	"rewards/probe_terminal_raw/mean": 0.03125,
	"rewards/probe_terminal_raw/std": 0.12296734005212784,
	"rewards/rollout_reward_func/mean": -0.40258198976516724,
	"rewards/rollout_reward_func/std": 0.1721213161945343,
	"sampling/importance_sampling_ratio/max": 1.015625,
	"sampling/importance_sampling_ratio/mean": 0.9557619690895081,
	"sampling/importance_sampling_ratio/min": 0.3387709856033325,
	"sampling/sampling_logp_difference/max": 1.0839133262634277,
	"sampling/sampling_logp_difference/mean": 0.021332627162337303,
	"step": 299,
	"step_time": 26.165180010000768
	},
	{
	"clip_ratio/high_max": 0.02083333395421505,
	"clip_ratio/high_mean": 0.010416666977107525,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.010416666977107525,
	"entropy": 0.04107913846030442,
	"epoch": 0.006,
	"grad_norm": 0.0022343825548887253,
	"kl": 0.42880946584045887,
	"learning_rate": 7.999896747535546e-06,
	"loss": 0.0,
	"step": 300,
	"step_time": 12.217135184999279
	}
	],
	"logging_steps": 1.0,
	"max_steps": 100000,
	"num_input_tokens_seen": 15625505,
	"num_train_epochs": 2,
	"save_steps": 500,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 2,
	"trial_name": null,
	"trial_params": null
	}